[llvm] 110c544 - [NFC][Codegen] Add tests with oversized shifts by non-byte-multiple

Roman Lebedev via llvm-commits llvm-commits at lists.llvm.org
Sat Dec 24 08:27:48 PST 2022


Author: Roman Lebedev
Date: 2022-12-24T19:26:41+03:00
New Revision: 110c5442b885625c28a14f17b60713624b3bba55

URL: https://github.com/llvm/llvm-project/commit/110c5442b885625c28a14f17b60713624b3bba55
DIFF: https://github.com/llvm/llvm-project/commit/110c5442b885625c28a14f17b60713624b3bba55.diff

LOG: [NFC][Codegen] Add tests with oversized shifts by non-byte-multiple

Added: 
    llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
    llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
    llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
    llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll

Modified: 
    

Removed: 
    


################################################################################
diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
new file mode 100644
index 000000000000..c9caa58ac584
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
@@ -0,0 +1,342 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=ALL
+
+define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: lshr_4bytes:
+; ALL:       // %bb.0:
+; ALL-NEXT:    ldr w8, [x1]
+; ALL-NEXT:    ldr w9, [x0]
+; ALL-NEXT:    lsr w8, w9, w8
+; ALL-NEXT:    str w8, [x2]
+; ALL-NEXT:    ret
+  %src = load i32, ptr %src.ptr, align 1
+  %bitOff = load i32, ptr %bitOff.ptr, align 1
+  %res = lshr i32 %src, %bitOff
+  store i32 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: shl_4bytes:
+; ALL:       // %bb.0:
+; ALL-NEXT:    ldr w8, [x1]
+; ALL-NEXT:    ldr w9, [x0]
+; ALL-NEXT:    lsl w8, w9, w8
+; ALL-NEXT:    str w8, [x2]
+; ALL-NEXT:    ret
+  %src = load i32, ptr %src.ptr, align 1
+  %bitOff = load i32, ptr %bitOff.ptr, align 1
+  %res = shl i32 %src, %bitOff
+  store i32 %res, ptr %dst, align 1
+  ret void
+}
+define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: ashr_4bytes:
+; ALL:       // %bb.0:
+; ALL-NEXT:    ldr w8, [x1]
+; ALL-NEXT:    ldr w9, [x0]
+; ALL-NEXT:    asr w8, w9, w8
+; ALL-NEXT:    str w8, [x2]
+; ALL-NEXT:    ret
+  %src = load i32, ptr %src.ptr, align 1
+  %bitOff = load i32, ptr %bitOff.ptr, align 1
+  %res = ashr i32 %src, %bitOff
+  store i32 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: lshr_8bytes:
+; ALL:       // %bb.0:
+; ALL-NEXT:    ldr x8, [x1]
+; ALL-NEXT:    ldr x9, [x0]
+; ALL-NEXT:    lsr x8, x9, x8
+; ALL-NEXT:    str x8, [x2]
+; ALL-NEXT:    ret
+  %src = load i64, ptr %src.ptr, align 1
+  %bitOff = load i64, ptr %bitOff.ptr, align 1
+  %res = lshr i64 %src, %bitOff
+  store i64 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: shl_8bytes:
+; ALL:       // %bb.0:
+; ALL-NEXT:    ldr x8, [x1]
+; ALL-NEXT:    ldr x9, [x0]
+; ALL-NEXT:    lsl x8, x9, x8
+; ALL-NEXT:    str x8, [x2]
+; ALL-NEXT:    ret
+  %src = load i64, ptr %src.ptr, align 1
+  %bitOff = load i64, ptr %bitOff.ptr, align 1
+  %res = shl i64 %src, %bitOff
+  store i64 %res, ptr %dst, align 1
+  ret void
+}
+define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: ashr_8bytes:
+; ALL:       // %bb.0:
+; ALL-NEXT:    ldr x8, [x1]
+; ALL-NEXT:    ldr x9, [x0]
+; ALL-NEXT:    asr x8, x9, x8
+; ALL-NEXT:    str x8, [x2]
+; ALL-NEXT:    ret
+  %src = load i64, ptr %src.ptr, align 1
+  %bitOff = load i64, ptr %bitOff.ptr, align 1
+  %res = ashr i64 %src, %bitOff
+  store i64 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: lshr_16bytes:
+; ALL:       // %bb.0:
+; ALL-NEXT:    ldr x8, [x1]
+; ALL-NEXT:    ldp x10, x9, [x0]
+; ALL-NEXT:    mvn w11, w8
+; ALL-NEXT:    tst x8, #0x40
+; ALL-NEXT:    lsr x10, x10, x8
+; ALL-NEXT:    lsl x12, x9, #1
+; ALL-NEXT:    lsr x9, x9, x8
+; ALL-NEXT:    lsl x11, x12, x11
+; ALL-NEXT:    orr x8, x11, x10
+; ALL-NEXT:    csel x10, xzr, x9, ne
+; ALL-NEXT:    csel x8, x9, x8, ne
+; ALL-NEXT:    stp x8, x10, [x2]
+; ALL-NEXT:    ret
+  %src = load i128, ptr %src.ptr, align 1
+  %bitOff = load i128, ptr %bitOff.ptr, align 1
+  %res = lshr i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: shl_16bytes:
+; ALL:       // %bb.0:
+; ALL-NEXT:    ldr x8, [x1]
+; ALL-NEXT:    ldp x9, x10, [x0]
+; ALL-NEXT:    mvn w11, w8
+; ALL-NEXT:    tst x8, #0x40
+; ALL-NEXT:    lsr x12, x9, #1
+; ALL-NEXT:    lsl x9, x9, x8
+; ALL-NEXT:    lsl x10, x10, x8
+; ALL-NEXT:    lsr x11, x12, x11
+; ALL-NEXT:    orr x8, x10, x11
+; ALL-NEXT:    csel x10, xzr, x9, ne
+; ALL-NEXT:    csel x8, x9, x8, ne
+; ALL-NEXT:    stp x10, x8, [x2]
+; ALL-NEXT:    ret
+  %src = load i128, ptr %src.ptr, align 1
+  %bitOff = load i128, ptr %bitOff.ptr, align 1
+  %res = shl i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: ashr_16bytes:
+; ALL:       // %bb.0:
+; ALL-NEXT:    ldr x8, [x1]
+; ALL-NEXT:    ldp x10, x9, [x0]
+; ALL-NEXT:    mvn w11, w8
+; ALL-NEXT:    tst x8, #0x40
+; ALL-NEXT:    lsr x10, x10, x8
+; ALL-NEXT:    lsl x12, x9, #1
+; ALL-NEXT:    lsl x11, x12, x11
+; ALL-NEXT:    asr x12, x9, x8
+; ALL-NEXT:    asr x8, x9, #63
+; ALL-NEXT:    orr x9, x11, x10
+; ALL-NEXT:    csel x8, x8, x12, ne
+; ALL-NEXT:    csel x9, x12, x9, ne
+; ALL-NEXT:    stp x9, x8, [x2]
+; ALL-NEXT:    ret
+  %src = load i128, ptr %src.ptr, align 1
+  %bitOff = load i128, ptr %bitOff.ptr, align 1
+  %res = ashr i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: lshr_32bytes:
+; ALL:       // %bb.0:
+; ALL-NEXT:    ldr x9, [x1]
+; ALL-NEXT:    mov w8, #128
+; ALL-NEXT:    ldp x10, x11, [x0, #16]
+; ALL-NEXT:    sub x8, x8, x9
+; ALL-NEXT:    mvn w16, w9
+; ALL-NEXT:    ldp x13, x12, [x0]
+; ALL-NEXT:    mvn w0, w8
+; ALL-NEXT:    lsr x14, x10, #1
+; ALL-NEXT:    lsl x1, x11, x8
+; ALL-NEXT:    tst x8, #0x40
+; ALL-NEXT:    lsl x8, x10, x8
+; ALL-NEXT:    lsl x17, x11, #1
+; ALL-NEXT:    lsr x14, x14, x0
+; ALL-NEXT:    csel x0, xzr, x8, ne
+; ALL-NEXT:    orr x14, x1, x14
+; ALL-NEXT:    lsr x15, x10, x9
+; ALL-NEXT:    csel x8, x8, x14, ne
+; ALL-NEXT:    lsl x14, x12, #1
+; ALL-NEXT:    lsl x3, x17, x16
+; ALL-NEXT:    lsr x1, x13, x9
+; ALL-NEXT:    lsl x14, x14, x16
+; ALL-NEXT:    lsr x18, x11, x9
+; ALL-NEXT:    orr x15, x3, x15
+; ALL-NEXT:    tst x9, #0x40
+; ALL-NEXT:    orr x14, x14, x1
+; ALL-NEXT:    lsr x16, x12, x9
+; ALL-NEXT:    csel x15, x18, x15, ne
+; ALL-NEXT:    csel x14, x16, x14, ne
+; ALL-NEXT:    csel x16, xzr, x16, ne
+; ALL-NEXT:    csel x18, xzr, x18, ne
+; ALL-NEXT:    subs x1, x9, #128
+; ALL-NEXT:    orr x14, x14, x0
+; ALL-NEXT:    mvn w3, w1
+; ALL-NEXT:    orr x8, x16, x8
+; ALL-NEXT:    lsr x10, x10, x1
+; ALL-NEXT:    lsr x11, x11, x1
+; ALL-NEXT:    lsl x17, x17, x3
+; ALL-NEXT:    orr x10, x17, x10
+; ALL-NEXT:    csel x17, x18, xzr, lo
+; ALL-NEXT:    tst x1, #0x40
+; ALL-NEXT:    csel x10, x11, x10, ne
+; ALL-NEXT:    csel x11, xzr, x11, ne
+; ALL-NEXT:    cmp x9, #128
+; ALL-NEXT:    csel x10, x14, x10, lo
+; ALL-NEXT:    csel x14, x15, xzr, lo
+; ALL-NEXT:    csel x8, x8, x11, lo
+; ALL-NEXT:    cmp x9, #0
+; ALL-NEXT:    csel x9, x13, x10, eq
+; ALL-NEXT:    csel x8, x12, x8, eq
+; ALL-NEXT:    stp x14, x17, [x2, #16]
+; ALL-NEXT:    stp x9, x8, [x2]
+; ALL-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %bitOff = load i256, ptr %bitOff.ptr, align 1
+  %res = lshr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: shl_32bytes:
+; ALL:       // %bb.0:
+; ALL-NEXT:    ldr x9, [x1]
+; ALL-NEXT:    mov w8, #128
+; ALL-NEXT:    ldp x11, x10, [x0]
+; ALL-NEXT:    sub x8, x8, x9
+; ALL-NEXT:    mvn w16, w9
+; ALL-NEXT:    ldp x12, x13, [x0, #16]
+; ALL-NEXT:    mvn w0, w8
+; ALL-NEXT:    tst x8, #0x40
+; ALL-NEXT:    lsl x14, x10, #1
+; ALL-NEXT:    lsr x1, x11, x8
+; ALL-NEXT:    lsr x8, x10, x8
+; ALL-NEXT:    lsr x17, x11, #1
+; ALL-NEXT:    lsl x14, x14, x0
+; ALL-NEXT:    csel x0, xzr, x8, ne
+; ALL-NEXT:    orr x14, x14, x1
+; ALL-NEXT:    lsl x15, x10, x9
+; ALL-NEXT:    csel x8, x8, x14, ne
+; ALL-NEXT:    lsr x14, x12, #1
+; ALL-NEXT:    lsr x3, x17, x16
+; ALL-NEXT:    lsl x1, x13, x9
+; ALL-NEXT:    lsr x14, x14, x16
+; ALL-NEXT:    lsl x18, x11, x9
+; ALL-NEXT:    orr x15, x15, x3
+; ALL-NEXT:    tst x9, #0x40
+; ALL-NEXT:    orr x14, x1, x14
+; ALL-NEXT:    lsl x16, x12, x9
+; ALL-NEXT:    csel x15, x18, x15, ne
+; ALL-NEXT:    csel x14, x16, x14, ne
+; ALL-NEXT:    csel x16, xzr, x16, ne
+; ALL-NEXT:    csel x18, xzr, x18, ne
+; ALL-NEXT:    subs x1, x9, #128
+; ALL-NEXT:    orr x14, x14, x0
+; ALL-NEXT:    mvn w3, w1
+; ALL-NEXT:    orr x8, x16, x8
+; ALL-NEXT:    lsl x10, x10, x1
+; ALL-NEXT:    lsl x11, x11, x1
+; ALL-NEXT:    lsr x17, x17, x3
+; ALL-NEXT:    orr x10, x10, x17
+; ALL-NEXT:    csel x17, x18, xzr, lo
+; ALL-NEXT:    tst x1, #0x40
+; ALL-NEXT:    csel x10, x11, x10, ne
+; ALL-NEXT:    csel x11, xzr, x11, ne
+; ALL-NEXT:    cmp x9, #128
+; ALL-NEXT:    csel x10, x14, x10, lo
+; ALL-NEXT:    csel x14, x15, xzr, lo
+; ALL-NEXT:    csel x8, x8, x11, lo
+; ALL-NEXT:    cmp x9, #0
+; ALL-NEXT:    csel x9, x13, x10, eq
+; ALL-NEXT:    csel x8, x12, x8, eq
+; ALL-NEXT:    stp x17, x14, [x2]
+; ALL-NEXT:    stp x8, x9, [x2, #16]
+; ALL-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %bitOff = load i256, ptr %bitOff.ptr, align 1
+  %res = shl i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: ashr_32bytes:
+; ALL:       // %bb.0:
+; ALL-NEXT:    ldr x9, [x1]
+; ALL-NEXT:    mov w8, #128
+; ALL-NEXT:    ldp x11, x10, [x0, #8]
+; ALL-NEXT:    sub x8, x8, x9
+; ALL-NEXT:    ldr x13, [x0, #24]
+; ALL-NEXT:    mvn w18, w8
+; ALL-NEXT:    ldr x12, [x0]
+; ALL-NEXT:    mvn w16, w9
+; ALL-NEXT:    tst x8, #0x40
+; ALL-NEXT:    lsr x14, x10, #1
+; ALL-NEXT:    lsl x1, x13, x8
+; ALL-NEXT:    lsr x14, x14, x18
+; ALL-NEXT:    lsl x8, x10, x8
+; ALL-NEXT:    orr x14, x1, x14
+; ALL-NEXT:    lsl x17, x13, #1
+; ALL-NEXT:    csel x18, xzr, x8, ne
+; ALL-NEXT:    csel x8, x8, x14, ne
+; ALL-NEXT:    lsl x14, x11, #1
+; ALL-NEXT:    lsr x15, x10, x9
+; ALL-NEXT:    lsl x3, x17, x16
+; ALL-NEXT:    lsr x1, x12, x9
+; ALL-NEXT:    lsl x14, x14, x16
+; ALL-NEXT:    asr x0, x13, x9
+; ALL-NEXT:    orr x15, x3, x15
+; ALL-NEXT:    tst x9, #0x40
+; ALL-NEXT:    orr x14, x14, x1
+; ALL-NEXT:    lsr x16, x11, x9
+; ALL-NEXT:    asr x1, x13, #63
+; ALL-NEXT:    csel x15, x0, x15, ne
+; ALL-NEXT:    csel x14, x16, x14, ne
+; ALL-NEXT:    csel x16, xzr, x16, ne
+; ALL-NEXT:    csel x0, x1, x0, ne
+; ALL-NEXT:    subs x3, x9, #128
+; ALL-NEXT:    mvn w4, w3
+; ALL-NEXT:    orr x14, x14, x18
+; ALL-NEXT:    orr x8, x16, x8
+; ALL-NEXT:    lsr x10, x10, x3
+; ALL-NEXT:    asr x13, x13, x3
+; ALL-NEXT:    lsl x17, x17, x4
+; ALL-NEXT:    orr x10, x17, x10
+; ALL-NEXT:    csel x17, x0, x1, lo
+; ALL-NEXT:    tst x3, #0x40
+; ALL-NEXT:    csel x10, x13, x10, ne
+; ALL-NEXT:    csel x13, x1, x13, ne
+; ALL-NEXT:    cmp x9, #128
+; ALL-NEXT:    csel x10, x14, x10, lo
+; ALL-NEXT:    csel x14, x15, x1, lo
+; ALL-NEXT:    csel x8, x8, x13, lo
+; ALL-NEXT:    cmp x9, #0
+; ALL-NEXT:    csel x9, x12, x10, eq
+; ALL-NEXT:    csel x8, x11, x8, eq
+; ALL-NEXT:    stp x14, x17, [x2, #16]
+; ALL-NEXT:    stp x9, x8, [x2]
+; ALL-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %bitOff = load i256, ptr %bitOff.ptr, align 1
+  %res = ashr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}

diff --git a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
new file mode 100644
index 000000000000..92d582e27123
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
@@ -0,0 +1,2407 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,LE,LE-64BIT
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu   | FileCheck %s --check-prefixes=ALL,BE
+; RUN: llc < %s -mtriple=ppc32--                       | FileCheck %s --check-prefixes=ALL,LE,LE-32BIT
+
+define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: lshr_4bytes:
+; ALL:       # %bb.0:
+; ALL-NEXT:    lwz 3, 0(3)
+; ALL-NEXT:    lwz 4, 0(4)
+; ALL-NEXT:    srw 3, 3, 4
+; ALL-NEXT:    stw 3, 0(5)
+; ALL-NEXT:    blr
+  %src = load i32, ptr %src.ptr, align 1
+  %bitOff = load i32, ptr %bitOff.ptr, align 1
+  %res = lshr i32 %src, %bitOff
+  store i32 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: shl_4bytes:
+; ALL:       # %bb.0:
+; ALL-NEXT:    lwz 3, 0(3)
+; ALL-NEXT:    lwz 4, 0(4)
+; ALL-NEXT:    slw 3, 3, 4
+; ALL-NEXT:    stw 3, 0(5)
+; ALL-NEXT:    blr
+  %src = load i32, ptr %src.ptr, align 1
+  %bitOff = load i32, ptr %bitOff.ptr, align 1
+  %res = shl i32 %src, %bitOff
+  store i32 %res, ptr %dst, align 1
+  ret void
+}
+define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: ashr_4bytes:
+; ALL:       # %bb.0:
+; ALL-NEXT:    lwz 3, 0(3)
+; ALL-NEXT:    lwz 4, 0(4)
+; ALL-NEXT:    sraw 3, 3, 4
+; ALL-NEXT:    stw 3, 0(5)
+; ALL-NEXT:    blr
+  %src = load i32, ptr %src.ptr, align 1
+  %bitOff = load i32, ptr %bitOff.ptr, align 1
+  %res = ashr i32 %src, %bitOff
+  store i32 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: lshr_8bytes:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    ld 3, 0(3)
+; LE-64BIT-NEXT:    lwz 4, 0(4)
+; LE-64BIT-NEXT:    srd 3, 3, 4
+; LE-64BIT-NEXT:    std 3, 0(5)
+; LE-64BIT-NEXT:    blr
+;
+; BE-LABEL: lshr_8bytes:
+; BE:       # %bb.0:
+; BE-NEXT:    ld 3, 0(3)
+; BE-NEXT:    lwz 4, 4(4)
+; BE-NEXT:    srd 3, 3, 4
+; BE-NEXT:    std 3, 0(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: lshr_8bytes:
+; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    lwz 4, 4(4)
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 3, 0(3)
+; LE-32BIT-NEXT:    subfic 7, 4, 32
+; LE-32BIT-NEXT:    srw 6, 6, 4
+; LE-32BIT-NEXT:    addi 8, 4, -32
+; LE-32BIT-NEXT:    slw 7, 3, 7
+; LE-32BIT-NEXT:    srw 4, 3, 4
+; LE-32BIT-NEXT:    srw 3, 3, 8
+; LE-32BIT-NEXT:    or 6, 6, 7
+; LE-32BIT-NEXT:    or 3, 6, 3
+; LE-32BIT-NEXT:    stw 4, 0(5)
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    blr
+  %src = load i64, ptr %src.ptr, align 1
+  %bitOff = load i64, ptr %bitOff.ptr, align 1
+  %res = lshr i64 %src, %bitOff
+  store i64 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: shl_8bytes:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    ld 3, 0(3)
+; LE-64BIT-NEXT:    lwz 4, 0(4)
+; LE-64BIT-NEXT:    sld 3, 3, 4
+; LE-64BIT-NEXT:    std 3, 0(5)
+; LE-64BIT-NEXT:    blr
+;
+; BE-LABEL: shl_8bytes:
+; BE:       # %bb.0:
+; BE-NEXT:    ld 3, 0(3)
+; BE-NEXT:    lwz 4, 4(4)
+; BE-NEXT:    sld 3, 3, 4
+; BE-NEXT:    std 3, 0(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: shl_8bytes:
+; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    lwz 4, 4(4)
+; LE-32BIT-NEXT:    lwz 6, 0(3)
+; LE-32BIT-NEXT:    lwz 3, 4(3)
+; LE-32BIT-NEXT:    subfic 7, 4, 32
+; LE-32BIT-NEXT:    slw 6, 6, 4
+; LE-32BIT-NEXT:    addi 8, 4, -32
+; LE-32BIT-NEXT:    srw 7, 3, 7
+; LE-32BIT-NEXT:    slw 4, 3, 4
+; LE-32BIT-NEXT:    slw 3, 3, 8
+; LE-32BIT-NEXT:    or 6, 6, 7
+; LE-32BIT-NEXT:    or 3, 6, 3
+; LE-32BIT-NEXT:    stw 4, 4(5)
+; LE-32BIT-NEXT:    stw 3, 0(5)
+; LE-32BIT-NEXT:    blr
+  %src = load i64, ptr %src.ptr, align 1
+  %bitOff = load i64, ptr %bitOff.ptr, align 1
+  %res = shl i64 %src, %bitOff
+  store i64 %res, ptr %dst, align 1
+  ret void
+}
+define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: ashr_8bytes:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    ld 3, 0(3)
+; LE-64BIT-NEXT:    lwz 4, 0(4)
+; LE-64BIT-NEXT:    srad 3, 3, 4
+; LE-64BIT-NEXT:    std 3, 0(5)
+; LE-64BIT-NEXT:    blr
+;
+; BE-LABEL: ashr_8bytes:
+; BE:       # %bb.0:
+; BE-NEXT:    ld 3, 0(3)
+; BE-NEXT:    lwz 4, 4(4)
+; BE-NEXT:    srad 3, 3, 4
+; BE-NEXT:    std 3, 0(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: ashr_8bytes:
+; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    lwz 4, 4(4)
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 3, 0(3)
+; LE-32BIT-NEXT:    subfic 7, 4, 32
+; LE-32BIT-NEXT:    srw 6, 6, 4
+; LE-32BIT-NEXT:    addi 8, 4, -32
+; LE-32BIT-NEXT:    slw 7, 3, 7
+; LE-32BIT-NEXT:    sraw 4, 3, 4
+; LE-32BIT-NEXT:    sraw 3, 3, 8
+; LE-32BIT-NEXT:    cmpwi 8, 1
+; LE-32BIT-NEXT:    or 6, 6, 7
+; LE-32BIT-NEXT:    bc 12, 0, .LBB5_1
+; LE-32BIT-NEXT:    b .LBB5_2
+; LE-32BIT-NEXT:  .LBB5_1:
+; LE-32BIT-NEXT:    addi 3, 6, 0
+; LE-32BIT-NEXT:  .LBB5_2:
+; LE-32BIT-NEXT:    stw 4, 0(5)
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    blr
+  %src = load i64, ptr %src.ptr, align 1
+  %bitOff = load i64, ptr %bitOff.ptr, align 1
+  %res = ashr i64 %src, %bitOff
+  store i64 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: lshr_16bytes:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    lwz 4, 0(4)
+; LE-64BIT-NEXT:    ld 6, 0(3)
+; LE-64BIT-NEXT:    ld 3, 8(3)
+; LE-64BIT-NEXT:    subfic 7, 4, 64
+; LE-64BIT-NEXT:    srd 6, 6, 4
+; LE-64BIT-NEXT:    addi 8, 4, -64
+; LE-64BIT-NEXT:    sld 7, 3, 7
+; LE-64BIT-NEXT:    or 6, 6, 7
+; LE-64BIT-NEXT:    srd 7, 3, 8
+; LE-64BIT-NEXT:    or 6, 6, 7
+; LE-64BIT-NEXT:    srd 3, 3, 4
+; LE-64BIT-NEXT:    std 3, 8(5)
+; LE-64BIT-NEXT:    std 6, 0(5)
+; LE-64BIT-NEXT:    blr
+;
+; BE-LABEL: lshr_16bytes:
+; BE:       # %bb.0:
+; BE-NEXT:    lwz 4, 12(4)
+; BE-NEXT:    ld 6, 0(3)
+; BE-NEXT:    ld 3, 8(3)
+; BE-NEXT:    subfic 7, 4, 64
+; BE-NEXT:    srd 3, 3, 4
+; BE-NEXT:    sld 7, 6, 7
+; BE-NEXT:    addi 8, 4, -64
+; BE-NEXT:    or 3, 3, 7
+; BE-NEXT:    srd 7, 6, 8
+; BE-NEXT:    srd 4, 6, 4
+; BE-NEXT:    or 3, 3, 7
+; BE-NEXT:    std 4, 0(5)
+; BE-NEXT:    std 3, 8(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: lshr_16bytes:
+; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    stwu 1, -32(1)
+; LE-32BIT-NEXT:    lwz 4, 12(4)
+; LE-32BIT-NEXT:    li 8, 0
+; LE-32BIT-NEXT:    lwz 6, 8(3)
+; LE-32BIT-NEXT:    lwz 7, 12(3)
+; LE-32BIT-NEXT:    subfic 10, 4, 96
+; LE-32BIT-NEXT:    lwz 9, 4(3)
+; LE-32BIT-NEXT:    addi 11, 4, -64
+; LE-32BIT-NEXT:    lwz 3, 0(3)
+; LE-32BIT-NEXT:    cmplwi 4, 64
+; LE-32BIT-NEXT:    stw 27, 12(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    srw 27, 9, 11
+; LE-32BIT-NEXT:    stw 28, 16(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    srw 28, 3, 4
+; LE-32BIT-NEXT:    stw 30, 24(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    subfic 30, 4, 32
+; LE-32BIT-NEXT:    slw 10, 3, 10
+; LE-32BIT-NEXT:    addi 12, 4, -96
+; LE-32BIT-NEXT:    srw 0, 7, 4
+; LE-32BIT-NEXT:    or 10, 27, 10
+; LE-32BIT-NEXT:    slw 27, 6, 30
+; LE-32BIT-NEXT:    bc 12, 0, .LBB6_2
+; LE-32BIT-NEXT:  # %bb.1:
+; LE-32BIT-NEXT:    ori 28, 8, 0
+; LE-32BIT-NEXT:    b .LBB6_2
+; LE-32BIT-NEXT:  .LBB6_2:
+; LE-32BIT-NEXT:    stw 29, 20(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    srw 29, 9, 4
+; LE-32BIT-NEXT:    or 0, 0, 27
+; LE-32BIT-NEXT:    slw 27, 3, 30
+; LE-32BIT-NEXT:    stw 28, 0(5)
+; LE-32BIT-NEXT:    subfic 28, 4, 64
+; LE-32BIT-NEXT:    srw 12, 3, 12
+; LE-32BIT-NEXT:    or 29, 29, 27
+; LE-32BIT-NEXT:    addi 27, 4, -32
+; LE-32BIT-NEXT:    or 10, 10, 12
+; LE-32BIT-NEXT:    subfic 12, 28, 32
+; LE-32BIT-NEXT:    slw 30, 9, 30
+; LE-32BIT-NEXT:    srw 12, 9, 12
+; LE-32BIT-NEXT:    slw 9, 9, 28
+; LE-32BIT-NEXT:    slw 28, 3, 28
+; LE-32BIT-NEXT:    srw 11, 3, 11
+; LE-32BIT-NEXT:    srw 3, 3, 27
+; LE-32BIT-NEXT:    srw 27, 6, 27
+; LE-32BIT-NEXT:    or 0, 0, 27
+; LE-32BIT-NEXT:    or 12, 28, 12
+; LE-32BIT-NEXT:    cmplwi 1, 4, 0
+; LE-32BIT-NEXT:    srw 4, 6, 4
+; LE-32BIT-NEXT:    or 3, 29, 3
+; LE-32BIT-NEXT:    or 9, 0, 9
+; LE-32BIT-NEXT:    or 12, 12, 30
+; LE-32BIT-NEXT:    bc 12, 0, .LBB6_4
+; LE-32BIT-NEXT:  # %bb.3:
+; LE-32BIT-NEXT:    ori 3, 8, 0
+; LE-32BIT-NEXT:    ori 8, 10, 0
+; LE-32BIT-NEXT:    b .LBB6_5
+; LE-32BIT-NEXT:  .LBB6_4:
+; LE-32BIT-NEXT:    addi 8, 9, 0
+; LE-32BIT-NEXT:  .LBB6_5:
+; LE-32BIT-NEXT:    or 4, 4, 12
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    bc 12, 6, .LBB6_7
+; LE-32BIT-NEXT:  # %bb.6:
+; LE-32BIT-NEXT:    ori 3, 8, 0
+; LE-32BIT-NEXT:    b .LBB6_8
+; LE-32BIT-NEXT:  .LBB6_7:
+; LE-32BIT-NEXT:    addi 3, 7, 0
+; LE-32BIT-NEXT:  .LBB6_8:
+; LE-32BIT-NEXT:    bc 12, 0, .LBB6_10
+; LE-32BIT-NEXT:  # %bb.9:
+; LE-32BIT-NEXT:    ori 4, 11, 0
+; LE-32BIT-NEXT:    b .LBB6_10
+; LE-32BIT-NEXT:  .LBB6_10:
+; LE-32BIT-NEXT:    stw 3, 12(5)
+; LE-32BIT-NEXT:    bc 12, 6, .LBB6_12
+; LE-32BIT-NEXT:  # %bb.11:
+; LE-32BIT-NEXT:    ori 3, 4, 0
+; LE-32BIT-NEXT:    b .LBB6_13
+; LE-32BIT-NEXT:  .LBB6_12:
+; LE-32BIT-NEXT:    addi 3, 6, 0
+; LE-32BIT-NEXT:  .LBB6_13:
+; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    lwz 30, 24(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 29, 20(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 28, 16(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 27, 12(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    addi 1, 1, 32
+; LE-32BIT-NEXT:    blr
+  %src = load i128, ptr %src.ptr, align 1
+  %bitOff = load i128, ptr %bitOff.ptr, align 1
+  %res = lshr i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: shl_16bytes:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    lwz 4, 0(4)
+; LE-64BIT-NEXT:    ld 6, 8(3)
+; LE-64BIT-NEXT:    ld 3, 0(3)
+; LE-64BIT-NEXT:    subfic 7, 4, 64
+; LE-64BIT-NEXT:    sld 6, 6, 4
+; LE-64BIT-NEXT:    addi 8, 4, -64
+; LE-64BIT-NEXT:    srd 7, 3, 7
+; LE-64BIT-NEXT:    or 6, 6, 7
+; LE-64BIT-NEXT:    sld 7, 3, 8
+; LE-64BIT-NEXT:    or 6, 6, 7
+; LE-64BIT-NEXT:    sld 3, 3, 4
+; LE-64BIT-NEXT:    std 3, 0(5)
+; LE-64BIT-NEXT:    std 6, 8(5)
+; LE-64BIT-NEXT:    blr
+;
+; BE-LABEL: shl_16bytes:
+; BE:       # %bb.0:
+; BE-NEXT:    lwz 4, 12(4)
+; BE-NEXT:    ld 6, 8(3)
+; BE-NEXT:    ld 3, 0(3)
+; BE-NEXT:    subfic 7, 4, 64
+; BE-NEXT:    sld 3, 3, 4
+; BE-NEXT:    srd 7, 6, 7
+; BE-NEXT:    addi 8, 4, -64
+; BE-NEXT:    or 3, 3, 7
+; BE-NEXT:    sld 7, 6, 8
+; BE-NEXT:    sld 4, 6, 4
+; BE-NEXT:    or 3, 3, 7
+; BE-NEXT:    std 4, 8(5)
+; BE-NEXT:    std 3, 0(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: shl_16bytes:
+; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    stwu 1, -32(1)
+; LE-32BIT-NEXT:    lwz 4, 12(4)
+; LE-32BIT-NEXT:    li 8, 0
+; LE-32BIT-NEXT:    lwz 6, 4(3)
+; LE-32BIT-NEXT:    lwz 7, 0(3)
+; LE-32BIT-NEXT:    subfic 10, 4, 96
+; LE-32BIT-NEXT:    lwz 9, 8(3)
+; LE-32BIT-NEXT:    addi 11, 4, -64
+; LE-32BIT-NEXT:    lwz 3, 12(3)
+; LE-32BIT-NEXT:    cmplwi 4, 64
+; LE-32BIT-NEXT:    stw 27, 12(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    slw 27, 9, 11
+; LE-32BIT-NEXT:    stw 28, 16(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    slw 28, 3, 4
+; LE-32BIT-NEXT:    stw 30, 24(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    subfic 30, 4, 32
+; LE-32BIT-NEXT:    srw 10, 3, 10
+; LE-32BIT-NEXT:    addi 12, 4, -96
+; LE-32BIT-NEXT:    slw 0, 7, 4
+; LE-32BIT-NEXT:    or 10, 27, 10
+; LE-32BIT-NEXT:    srw 27, 6, 30
+; LE-32BIT-NEXT:    bc 12, 0, .LBB7_2
+; LE-32BIT-NEXT:  # %bb.1:
+; LE-32BIT-NEXT:    ori 28, 8, 0
+; LE-32BIT-NEXT:    b .LBB7_2
+; LE-32BIT-NEXT:  .LBB7_2:
+; LE-32BIT-NEXT:    stw 29, 20(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    slw 29, 9, 4
+; LE-32BIT-NEXT:    or 0, 0, 27
+; LE-32BIT-NEXT:    srw 27, 3, 30
+; LE-32BIT-NEXT:    stw 28, 12(5)
+; LE-32BIT-NEXT:    subfic 28, 4, 64
+; LE-32BIT-NEXT:    slw 12, 3, 12
+; LE-32BIT-NEXT:    or 29, 29, 27
+; LE-32BIT-NEXT:    addi 27, 4, -32
+; LE-32BIT-NEXT:    or 10, 10, 12
+; LE-32BIT-NEXT:    subfic 12, 28, 32
+; LE-32BIT-NEXT:    srw 30, 9, 30
+; LE-32BIT-NEXT:    slw 12, 9, 12
+; LE-32BIT-NEXT:    srw 9, 9, 28
+; LE-32BIT-NEXT:    srw 28, 3, 28
+; LE-32BIT-NEXT:    slw 11, 3, 11
+; LE-32BIT-NEXT:    slw 3, 3, 27
+; LE-32BIT-NEXT:    slw 27, 6, 27
+; LE-32BIT-NEXT:    or 0, 0, 27
+; LE-32BIT-NEXT:    or 12, 28, 12
+; LE-32BIT-NEXT:    cmplwi 1, 4, 0
+; LE-32BIT-NEXT:    slw 4, 6, 4
+; LE-32BIT-NEXT:    or 3, 29, 3
+; LE-32BIT-NEXT:    or 9, 0, 9
+; LE-32BIT-NEXT:    or 12, 12, 30
+; LE-32BIT-NEXT:    bc 12, 0, .LBB7_4
+; LE-32BIT-NEXT:  # %bb.3:
+; LE-32BIT-NEXT:    ori 3, 8, 0
+; LE-32BIT-NEXT:    ori 8, 10, 0
+; LE-32BIT-NEXT:    b .LBB7_5
+; LE-32BIT-NEXT:  .LBB7_4:
+; LE-32BIT-NEXT:    addi 8, 9, 0
+; LE-32BIT-NEXT:  .LBB7_5:
+; LE-32BIT-NEXT:    or 4, 4, 12
+; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    bc 12, 6, .LBB7_7
+; LE-32BIT-NEXT:  # %bb.6:
+; LE-32BIT-NEXT:    ori 3, 8, 0
+; LE-32BIT-NEXT:    b .LBB7_8
+; LE-32BIT-NEXT:  .LBB7_7:
+; LE-32BIT-NEXT:    addi 3, 7, 0
+; LE-32BIT-NEXT:  .LBB7_8:
+; LE-32BIT-NEXT:    bc 12, 0, .LBB7_10
+; LE-32BIT-NEXT:  # %bb.9:
+; LE-32BIT-NEXT:    ori 4, 11, 0
+; LE-32BIT-NEXT:    b .LBB7_10
+; LE-32BIT-NEXT:  .LBB7_10:
+; LE-32BIT-NEXT:    stw 3, 0(5)
+; LE-32BIT-NEXT:    bc 12, 6, .LBB7_12
+; LE-32BIT-NEXT:  # %bb.11:
+; LE-32BIT-NEXT:    ori 3, 4, 0
+; LE-32BIT-NEXT:    b .LBB7_13
+; LE-32BIT-NEXT:  .LBB7_12:
+; LE-32BIT-NEXT:    addi 3, 6, 0
+; LE-32BIT-NEXT:  .LBB7_13:
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    lwz 30, 24(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 29, 20(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 28, 16(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 27, 12(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    addi 1, 1, 32
+; LE-32BIT-NEXT:    blr
+  %src = load i128, ptr %src.ptr, align 1
+  %bitOff = load i128, ptr %bitOff.ptr, align 1
+  %res = shl i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: ashr_16bytes:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    lwz 4, 0(4)
+; LE-64BIT-NEXT:    ld 6, 0(3)
+; LE-64BIT-NEXT:    ld 3, 8(3)
+; LE-64BIT-NEXT:    subfic 7, 4, 64
+; LE-64BIT-NEXT:    srd 6, 6, 4
+; LE-64BIT-NEXT:    addi 8, 4, -64
+; LE-64BIT-NEXT:    sld 7, 3, 7
+; LE-64BIT-NEXT:    cmpwi 8, 1
+; LE-64BIT-NEXT:    or 6, 6, 7
+; LE-64BIT-NEXT:    srad 7, 3, 8
+; LE-64BIT-NEXT:    isellt 6, 6, 7
+; LE-64BIT-NEXT:    srad 3, 3, 4
+; LE-64BIT-NEXT:    std 3, 8(5)
+; LE-64BIT-NEXT:    std 6, 0(5)
+; LE-64BIT-NEXT:    blr
+;
+; BE-LABEL: ashr_16bytes:
+; BE:       # %bb.0:
+; BE-NEXT:    lwz 4, 12(4)
+; BE-NEXT:    ld 6, 8(3)
+; BE-NEXT:    ld 3, 0(3)
+; BE-NEXT:    subfic 7, 4, 64
+; BE-NEXT:    srd 6, 6, 4
+; BE-NEXT:    addi 8, 4, -64
+; BE-NEXT:    sld 7, 3, 7
+; BE-NEXT:    cmpwi 8, 1
+; BE-NEXT:    or 6, 6, 7
+; BE-NEXT:    srad 7, 3, 8
+; BE-NEXT:    srad 3, 3, 4
+; BE-NEXT:    bc 12, 0, .LBB8_2
+; BE-NEXT:  # %bb.1:
+; BE-NEXT:    ori 6, 7, 0
+; BE-NEXT:    b .LBB8_2
+; BE-NEXT:  .LBB8_2:
+; BE-NEXT:    std 3, 0(5)
+; BE-NEXT:    std 6, 8(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: ashr_16bytes:
+; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    stwu 1, -32(1)
+; LE-32BIT-NEXT:    lwz 4, 12(4)
+; LE-32BIT-NEXT:    lwz 6, 8(3)
+; LE-32BIT-NEXT:    lwz 7, 12(3)
+; LE-32BIT-NEXT:    subfic 9, 4, 96
+; LE-32BIT-NEXT:    lwz 8, 4(3)
+; LE-32BIT-NEXT:    addi 10, 4, -64
+; LE-32BIT-NEXT:    lwz 3, 0(3)
+; LE-32BIT-NEXT:    subfic 0, 4, 32
+; LE-32BIT-NEXT:    stw 27, 12(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    srw 27, 8, 10
+; LE-32BIT-NEXT:    slw 9, 3, 9
+; LE-32BIT-NEXT:    srw 12, 7, 4
+; LE-32BIT-NEXT:    or 9, 27, 9
+; LE-32BIT-NEXT:    slw 27, 6, 0
+; LE-32BIT-NEXT:    stw 29, 20(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    srw 29, 8, 4
+; LE-32BIT-NEXT:    or 12, 12, 27
+; LE-32BIT-NEXT:    slw 27, 3, 0
+; LE-32BIT-NEXT:    stw 28, 16(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    cmplwi 4, 64
+; LE-32BIT-NEXT:    srawi 28, 3, 31
+; LE-32BIT-NEXT:    or 29, 29, 27
+; LE-32BIT-NEXT:    sraw 27, 3, 4
+; LE-32BIT-NEXT:    addi 11, 4, -96
+; LE-32BIT-NEXT:    bc 12, 0, .LBB8_2
+; LE-32BIT-NEXT:  # %bb.1:
+; LE-32BIT-NEXT:    ori 27, 28, 0
+; LE-32BIT-NEXT:    b .LBB8_2
+; LE-32BIT-NEXT:  .LBB8_2:
+; LE-32BIT-NEXT:    cmpwi 1, 11, 1
+; LE-32BIT-NEXT:    sraw 11, 3, 11
+; LE-32BIT-NEXT:    stw 27, 0(5)
+; LE-32BIT-NEXT:    subfic 27, 4, 64
+; LE-32BIT-NEXT:    stw 30, 24(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    addi 30, 4, -32
+; LE-32BIT-NEXT:    bc 12, 4, .LBB8_4
+; LE-32BIT-NEXT:  # %bb.3:
+; LE-32BIT-NEXT:    ori 9, 11, 0
+; LE-32BIT-NEXT:    b .LBB8_4
+; LE-32BIT-NEXT:  .LBB8_4:
+; LE-32BIT-NEXT:    subfic 11, 27, 32
+; LE-32BIT-NEXT:    slw 0, 8, 0
+; LE-32BIT-NEXT:    srw 11, 8, 11
+; LE-32BIT-NEXT:    slw 8, 8, 27
+; LE-32BIT-NEXT:    slw 27, 3, 27
+; LE-32BIT-NEXT:    sraw 10, 3, 10
+; LE-32BIT-NEXT:    sraw 3, 3, 30
+; LE-32BIT-NEXT:    cmpwi 1, 30, 1
+; LE-32BIT-NEXT:    srw 30, 6, 30
+; LE-32BIT-NEXT:    or 12, 12, 30
+; LE-32BIT-NEXT:    or 11, 27, 11
+; LE-32BIT-NEXT:    bc 12, 4, .LBB8_5
+; LE-32BIT-NEXT:    b .LBB8_6
+; LE-32BIT-NEXT:  .LBB8_5:
+; LE-32BIT-NEXT:    addi 3, 29, 0
+; LE-32BIT-NEXT:  .LBB8_6:
+; LE-32BIT-NEXT:    cmplwi 1, 4, 0
+; LE-32BIT-NEXT:    srw 4, 6, 4
+; LE-32BIT-NEXT:    or 8, 12, 8
+; LE-32BIT-NEXT:    or 11, 11, 0
+; LE-32BIT-NEXT:    bc 12, 0, .LBB8_8
+; LE-32BIT-NEXT:  # %bb.7:
+; LE-32BIT-NEXT:    ori 3, 28, 0
+; LE-32BIT-NEXT:    ori 8, 9, 0
+; LE-32BIT-NEXT:    b .LBB8_8
+; LE-32BIT-NEXT:  .LBB8_8:
+; LE-32BIT-NEXT:    or 4, 4, 11
+; LE-32BIT-NEXT:    stw 3, 4(5)
+; LE-32BIT-NEXT:    bc 12, 6, .LBB8_10
+; LE-32BIT-NEXT:  # %bb.9:
+; LE-32BIT-NEXT:    ori 3, 8, 0
+; LE-32BIT-NEXT:    b .LBB8_11
+; LE-32BIT-NEXT:  .LBB8_10:
+; LE-32BIT-NEXT:    addi 3, 7, 0
+; LE-32BIT-NEXT:  .LBB8_11:
+; LE-32BIT-NEXT:    bc 12, 0, .LBB8_13
+; LE-32BIT-NEXT:  # %bb.12:
+; LE-32BIT-NEXT:    ori 4, 10, 0
+; LE-32BIT-NEXT:    b .LBB8_13
+; LE-32BIT-NEXT:  .LBB8_13:
+; LE-32BIT-NEXT:    stw 3, 12(5)
+; LE-32BIT-NEXT:    bc 12, 6, .LBB8_15
+; LE-32BIT-NEXT:  # %bb.14:
+; LE-32BIT-NEXT:    ori 3, 4, 0
+; LE-32BIT-NEXT:    b .LBB8_16
+; LE-32BIT-NEXT:  .LBB8_15:
+; LE-32BIT-NEXT:    addi 3, 6, 0
+; LE-32BIT-NEXT:  .LBB8_16:
+; LE-32BIT-NEXT:    stw 3, 8(5)
+; LE-32BIT-NEXT:    lwz 30, 24(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 29, 20(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 28, 16(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 27, 12(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    addi 1, 1, 32
+; LE-32BIT-NEXT:    blr
+  %src = load i128, ptr %src.ptr, align 1
+  %bitOff = load i128, ptr %bitOff.ptr, align 1
+  %res = ashr i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: lshr_32bytes:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    lwz 4, 0(4)
+; LE-64BIT-NEXT:    ld 7, 0(3)
+; LE-64BIT-NEXT:    ld 8, 8(3)
+; LE-64BIT-NEXT:    ld 9, 16(3)
+; LE-64BIT-NEXT:    li 6, 0
+; LE-64BIT-NEXT:    ld 3, 24(3)
+; LE-64BIT-NEXT:    std 28, -32(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    std 21, -88(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    std 24, -64(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    subfic 28, 4, 64
+; LE-64BIT-NEXT:    std 25, -56(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    subfic 11, 4, 192
+; LE-64BIT-NEXT:    addi 0, 4, -128
+; LE-64BIT-NEXT:    subfic 25, 4, 128
+; LE-64BIT-NEXT:    std 27, -40(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    std 29, -24(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    srd 29, 9, 4
+; LE-64BIT-NEXT:    addi 27, 4, -64
+; LE-64BIT-NEXT:    std 22, -80(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    sld 24, 8, 28
+; LE-64BIT-NEXT:    sld 21, 9, 28
+; LE-64BIT-NEXT:    std 26, -48(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    sld 28, 3, 28
+; LE-64BIT-NEXT:    srd 10, 7, 4
+; LE-64BIT-NEXT:    std 30, -16(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    addi 30, 4, -192
+; LE-64BIT-NEXT:    sld 11, 3, 11
+; LE-64BIT-NEXT:    subfic 22, 25, 64
+; LE-64BIT-NEXT:    or 29, 29, 28
+; LE-64BIT-NEXT:    srd 26, 9, 0
+; LE-64BIT-NEXT:    srd 28, 3, 27
+; LE-64BIT-NEXT:    std 23, -72(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    or 10, 10, 24
+; LE-64BIT-NEXT:    ld 24, -64(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    srd 30, 3, 30
+; LE-64BIT-NEXT:    srd 23, 8, 27
+; LE-64BIT-NEXT:    or 11, 26, 11
+; LE-64BIT-NEXT:    or 29, 29, 28
+; LE-64BIT-NEXT:    srd 27, 9, 22
+; LE-64BIT-NEXT:    sld 28, 3, 25
+; LE-64BIT-NEXT:    or 10, 10, 23
+; LE-64BIT-NEXT:    or 11, 11, 30
+; LE-64BIT-NEXT:    ld 26, -48(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    sld 9, 9, 25
+; LE-64BIT-NEXT:    or 30, 28, 27
+; LE-64BIT-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    ld 25, -56(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    ld 23, -72(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    cmplwi 4, 128
+; LE-64BIT-NEXT:    srd 12, 8, 4
+; LE-64BIT-NEXT:    or 9, 10, 9
+; LE-64BIT-NEXT:    or 30, 30, 21
+; LE-64BIT-NEXT:    ld 22, -80(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    ld 21, -88(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    cmplwi 1, 4, 0
+; LE-64BIT-NEXT:    srd 10, 3, 0
+; LE-64BIT-NEXT:    isellt 9, 9, 11
+; LE-64BIT-NEXT:    or 11, 12, 30
+; LE-64BIT-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    isel 7, 7, 9, 6
+; LE-64BIT-NEXT:    srd 3, 3, 4
+; LE-64BIT-NEXT:    isellt 9, 11, 10
+; LE-64BIT-NEXT:    std 7, 0(5)
+; LE-64BIT-NEXT:    isellt 0, 29, 6
+; LE-64BIT-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    isel 4, 8, 9, 6
+; LE-64BIT-NEXT:    std 0, 16(5)
+; LE-64BIT-NEXT:    isellt 3, 3, 6
+; LE-64BIT-NEXT:    std 4, 8(5)
+; LE-64BIT-NEXT:    std 3, 24(5)
+; LE-64BIT-NEXT:    blr
+;
+; BE-LABEL: lshr_32bytes:
+; BE:       # %bb.0:
+; BE-NEXT:    lwz 4, 28(4)
+; BE-NEXT:    ld 7, 24(3)
+; BE-NEXT:    ld 8, 16(3)
+; BE-NEXT:    ld 9, 8(3)
+; BE-NEXT:    ld 3, 0(3)
+; BE-NEXT:    std 27, -40(1) # 8-byte Folded Spill
+; BE-NEXT:    std 30, -16(1) # 8-byte Folded Spill
+; BE-NEXT:    std 28, -32(1) # 8-byte Folded Spill
+; BE-NEXT:    std 29, -24(1) # 8-byte Folded Spill
+; BE-NEXT:    li 6, 0
+; BE-NEXT:    subfic 10, 4, 192
+; BE-NEXT:    addi 11, 4, -128
+; BE-NEXT:    addi 12, 4, -192
+; BE-NEXT:    subfic 30, 4, 64
+; BE-NEXT:    sld 10, 3, 10
+; BE-NEXT:    srd 27, 9, 11
+; BE-NEXT:    srd 0, 7, 4
+; BE-NEXT:    addi 29, 4, -64
+; BE-NEXT:    subfic 28, 4, 128
+; BE-NEXT:    srd 12, 3, 12
+; BE-NEXT:    or 10, 27, 10
+; BE-NEXT:    sld 27, 8, 30
+; BE-NEXT:    or 10, 10, 12
+; BE-NEXT:    or 0, 0, 27
+; BE-NEXT:    srd 27, 8, 29
+; BE-NEXT:    subfic 12, 28, 64
+; BE-NEXT:    or 0, 0, 27
+; BE-NEXT:    sld 27, 3, 28
+; BE-NEXT:    srd 12, 9, 12
+; BE-NEXT:    sld 28, 9, 28
+; BE-NEXT:    cmplwi 4, 128
+; BE-NEXT:    or 12, 27, 12
+; BE-NEXT:    or 28, 0, 28
+; BE-NEXT:    sld 0, 9, 30
+; BE-NEXT:    srd 9, 9, 4
+; BE-NEXT:    srd 11, 3, 11
+; BE-NEXT:    cmplwi 1, 4, 0
+; BE-NEXT:    or 12, 12, 0
+; BE-NEXT:    srd 0, 8, 4
+; BE-NEXT:    bc 12, 0, .LBB9_1
+; BE-NEXT:    b .LBB9_2
+; BE-NEXT:  .LBB9_1:
+; BE-NEXT:    addi 10, 28, 0
+; BE-NEXT:  .LBB9_2:
+; BE-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
+; BE-NEXT:    or 12, 0, 12
+; BE-NEXT:    sld 0, 3, 30
+; BE-NEXT:    srd 30, 3, 29
+; BE-NEXT:    bc 12, 0, .LBB9_3
+; BE-NEXT:    b .LBB9_4
+; BE-NEXT:  .LBB9_3:
+; BE-NEXT:    addi 11, 12, 0
+; BE-NEXT:  .LBB9_4:
+; BE-NEXT:    srd 3, 3, 4
+; BE-NEXT:    bc 12, 6, .LBB9_6
+; BE-NEXT:  # %bb.5:
+; BE-NEXT:    ori 4, 10, 0
+; BE-NEXT:    b .LBB9_7
+; BE-NEXT:  .LBB9_6:
+; BE-NEXT:    addi 4, 7, 0
+; BE-NEXT:  .LBB9_7:
+; BE-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
+; BE-NEXT:    or 9, 9, 0
+; BE-NEXT:    or 9, 9, 30
+; BE-NEXT:    bc 12, 6, .LBB9_9
+; BE-NEXT:  # %bb.8:
+; BE-NEXT:    ori 7, 11, 0
+; BE-NEXT:    b .LBB9_10
+; BE-NEXT:  .LBB9_9:
+; BE-NEXT:    addi 7, 8, 0
+; BE-NEXT:  .LBB9_10:
+; BE-NEXT:    bc 12, 0, .LBB9_12
+; BE-NEXT:  # %bb.11:
+; BE-NEXT:    ori 8, 6, 0
+; BE-NEXT:    ori 3, 6, 0
+; BE-NEXT:    b .LBB9_13
+; BE-NEXT:  .LBB9_12:
+; BE-NEXT:    addi 8, 9, 0
+; BE-NEXT:  .LBB9_13:
+; BE-NEXT:    std 4, 24(5)
+; BE-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
+; BE-NEXT:    std 3, 0(5)
+; BE-NEXT:    std 8, 8(5)
+; BE-NEXT:    std 7, 16(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: lshr_32bytes:
+; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    stwu 1, -144(1)
+; LE-32BIT-NEXT:    mfcr 12
+; LE-32BIT-NEXT:    stw 14, 72(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 15, 76(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 16, 80(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 17, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 18, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 19, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 20, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 21, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 22, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 23, 108(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 24, 112(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 25, 116(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 26, 120(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 27, 124(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 28, 128(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 29, 132(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 30, 136(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 31, 140(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 12, 68(1)
+; LE-32BIT-NEXT:    lwz 30, 28(4)
+; LE-32BIT-NEXT:    lwz 9, 28(3)
+; LE-32BIT-NEXT:    lwz 10, 4(3)
+; LE-32BIT-NEXT:    subfic 21, 30, 224
+; LE-32BIT-NEXT:    lwz 11, 0(3)
+; LE-32BIT-NEXT:    subfic 4, 30, 160
+; LE-32BIT-NEXT:    stw 5, 64(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    addi 0, 30, -128
+; LE-32BIT-NEXT:    lwz 5, 24(3)
+; LE-32BIT-NEXT:    subfic 28, 30, 96
+; LE-32BIT-NEXT:    lwz 19, 20(3)
+; LE-32BIT-NEXT:    addi 29, 30, -64
+; LE-32BIT-NEXT:    lwz 8, 16(3)
+; LE-32BIT-NEXT:    srw 20, 9, 30
+; LE-32BIT-NEXT:    lwz 12, 12(3)
+; LE-32BIT-NEXT:    slw 21, 11, 21
+; LE-32BIT-NEXT:    lwz 6, 8(3)
+; LE-32BIT-NEXT:    addi 3, 30, -192
+; LE-32BIT-NEXT:    stw 9, 60(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    subfic 9, 30, 32
+; LE-32BIT-NEXT:    srw 16, 10, 3
+; LE-32BIT-NEXT:    stw 3, 56(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    slw 15, 6, 4
+; LE-32BIT-NEXT:    srw 14, 12, 0
+; LE-32BIT-NEXT:    slw 31, 8, 28
+; LE-32BIT-NEXT:    srw 3, 19, 29
+; LE-32BIT-NEXT:    or 21, 16, 21
+; LE-32BIT-NEXT:    slw 16, 5, 9
+; LE-32BIT-NEXT:    srw 25, 19, 30
+; LE-32BIT-NEXT:    or 15, 14, 15
+; LE-32BIT-NEXT:    slw 14, 8, 9
+; LE-32BIT-NEXT:    or 3, 3, 31
+; LE-32BIT-NEXT:    slw 31, 11, 4
+; LE-32BIT-NEXT:    or 20, 20, 16
+; LE-32BIT-NEXT:    srw 16, 10, 0
+; LE-32BIT-NEXT:    addi 26, 30, -224
+; LE-32BIT-NEXT:    stw 4, 36(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    or 4, 25, 14
+; LE-32BIT-NEXT:    slw 14, 11, 28
+; LE-32BIT-NEXT:    or 16, 16, 31
+; LE-32BIT-NEXT:    srw 31, 10, 29
+; LE-32BIT-NEXT:    addi 23, 30, -160
+; LE-32BIT-NEXT:    srw 18, 12, 30
+; LE-32BIT-NEXT:    stw 0, 40(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    or 14, 31, 14
+; LE-32BIT-NEXT:    stw 29, 52(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    mr 29, 6
+; LE-32BIT-NEXT:    slw 31, 6, 9
+; LE-32BIT-NEXT:    srw 0, 11, 26
+; LE-32BIT-NEXT:    addi 24, 30, -96
+; LE-32BIT-NEXT:    srw 17, 10, 30
+; LE-32BIT-NEXT:    or 18, 18, 31
+; LE-32BIT-NEXT:    slw 31, 11, 9
+; LE-32BIT-NEXT:    or 6, 21, 0
+; LE-32BIT-NEXT:    srw 0, 29, 23
+; LE-32BIT-NEXT:    or 17, 17, 31
+; LE-32BIT-NEXT:    addi 31, 30, -32
+; LE-32BIT-NEXT:    or 0, 15, 0
+; LE-32BIT-NEXT:    srw 15, 8, 24
+; LE-32BIT-NEXT:    or 3, 3, 15
+; LE-32BIT-NEXT:    srw 15, 5, 31
+; LE-32BIT-NEXT:    or 20, 20, 15
+; LE-32BIT-NEXT:    srw 15, 8, 31
+; LE-32BIT-NEXT:    stw 3, 28(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    or 3, 4, 15
+; LE-32BIT-NEXT:    srw 23, 11, 23
+; LE-32BIT-NEXT:    stw 3, 48(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    subfic 15, 30, 64
+; LE-32BIT-NEXT:    or 3, 16, 23
+; LE-32BIT-NEXT:    stw 3, 44(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    subfic 3, 15, 32
+; LE-32BIT-NEXT:    slw 16, 29, 15
+; LE-32BIT-NEXT:    srw 22, 12, 3
+; LE-32BIT-NEXT:    or 21, 16, 22
+; LE-32BIT-NEXT:    subfic 16, 30, 128
+; LE-32BIT-NEXT:    mr 7, 10
+; LE-32BIT-NEXT:    mr 10, 5
+; LE-32BIT-NEXT:    subfic 5, 16, 32
+; LE-32BIT-NEXT:    stw 6, 32(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    mr 6, 24
+; LE-32BIT-NEXT:    slw 4, 11, 16
+; LE-32BIT-NEXT:    srw 24, 7, 5
+; LE-32BIT-NEXT:    or 22, 4, 24
+; LE-32BIT-NEXT:    slw 24, 29, 16
+; LE-32BIT-NEXT:    srw 27, 12, 5
+; LE-32BIT-NEXT:    or 27, 24, 27
+; LE-32BIT-NEXT:    slw 24, 8, 15
+; LE-32BIT-NEXT:    srw 26, 19, 3
+; LE-32BIT-NEXT:    or 26, 24, 26
+; LE-32BIT-NEXT:    subfic 24, 30, 192
+; LE-32BIT-NEXT:    mr 25, 28
+; LE-32BIT-NEXT:    subfic 28, 24, 32
+; LE-32BIT-NEXT:    mr 23, 19
+; LE-32BIT-NEXT:    srw 28, 7, 28
+; LE-32BIT-NEXT:    slw 19, 11, 24
+; LE-32BIT-NEXT:    mr 4, 29
+; LE-32BIT-NEXT:    or 28, 19, 28
+; LE-32BIT-NEXT:    srw 19, 11, 6
+; LE-32BIT-NEXT:    or 19, 14, 19
+; LE-32BIT-NEXT:    srw 14, 4, 31
+; LE-32BIT-NEXT:    or 6, 18, 14
+; LE-32BIT-NEXT:    lwz 18, 64(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    srw 3, 7, 3
+; LE-32BIT-NEXT:    slw 14, 11, 15
+; LE-32BIT-NEXT:    cmplwi 1, 30, 64
+; LE-32BIT-NEXT:    cmplwi 30, 128
+; LE-32BIT-NEXT:    slw 24, 7, 24
+; LE-32BIT-NEXT:    mr 29, 12
+; LE-32BIT-NEXT:    or 12, 14, 3
+; LE-32BIT-NEXT:    srw 14, 11, 31
+; LE-32BIT-NEXT:    crnand 28, 0, 4
+; LE-32BIT-NEXT:    srw 31, 11, 30
+; LE-32BIT-NEXT:    or 24, 0, 24
+; LE-32BIT-NEXT:    slw 0, 23, 15
+; LE-32BIT-NEXT:    or 17, 17, 14
+; LE-32BIT-NEXT:    bc 12, 28, .LBB9_2
+; LE-32BIT-NEXT:  # %bb.1:
+; LE-32BIT-NEXT:    ori 14, 31, 0
+; LE-32BIT-NEXT:    b .LBB9_3
+; LE-32BIT-NEXT:  .LBB9_2:
+; LE-32BIT-NEXT:    li 14, 0
+; LE-32BIT-NEXT:  .LBB9_3:
+; LE-32BIT-NEXT:    or 20, 20, 0
+; LE-32BIT-NEXT:    subfic 0, 16, 64
+; LE-32BIT-NEXT:    stw 14, 0(18)
+; LE-32BIT-NEXT:    subfic 14, 0, 32
+; LE-32BIT-NEXT:    slw 14, 4, 14
+; LE-32BIT-NEXT:    srw 31, 29, 0
+; LE-32BIT-NEXT:    or 14, 31, 14
+; LE-32BIT-NEXT:    slw 31, 29, 9
+; LE-32BIT-NEXT:    mr 3, 29
+; LE-32BIT-NEXT:    or 29, 21, 31
+; LE-32BIT-NEXT:    slw 31, 7, 25
+; LE-32BIT-NEXT:    stw 29, 20(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    or 29, 22, 31
+; LE-32BIT-NEXT:    slw 31, 3, 25
+; LE-32BIT-NEXT:    or 27, 27, 31
+; LE-32BIT-NEXT:    stw 27, 24(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    slw 31, 23, 9
+; LE-32BIT-NEXT:    lwz 27, 36(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    or 26, 26, 31
+; LE-32BIT-NEXT:    slw 25, 7, 9
+; LE-32BIT-NEXT:    or 12, 12, 25
+; LE-32BIT-NEXT:    slw 31, 7, 27
+; LE-32BIT-NEXT:    or 28, 28, 31
+; LE-32BIT-NEXT:    slw 31, 7, 15
+; LE-32BIT-NEXT:    or 22, 6, 31
+; LE-32BIT-NEXT:    lwz 31, 40(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    srw 0, 4, 0
+; LE-32BIT-NEXT:    lwz 6, 32(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    or 27, 29, 0
+; LE-32BIT-NEXT:    cmplwi 6, 31, 64
+; LE-32BIT-NEXT:    srw 0, 10, 30
+; LE-32BIT-NEXT:    bc 12, 24, .LBB9_5
+; LE-32BIT-NEXT:  # %bb.4:
+; LE-32BIT-NEXT:    ori 25, 6, 0
+; LE-32BIT-NEXT:    b .LBB9_6
+; LE-32BIT-NEXT:  .LBB9_5:
+; LE-32BIT-NEXT:    addi 25, 24, 0
+; LE-32BIT-NEXT:  .LBB9_6:
+; LE-32BIT-NEXT:    lwz 6, 28(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    or 26, 0, 26
+; LE-32BIT-NEXT:    srw 0, 4, 31
+; LE-32BIT-NEXT:    or 28, 0, 28
+; LE-32BIT-NEXT:    srw 0, 4, 30
+; LE-32BIT-NEXT:    bc 12, 4, .LBB9_8
+; LE-32BIT-NEXT:  # %bb.7:
+; LE-32BIT-NEXT:    ori 9, 6, 0
+; LE-32BIT-NEXT:    b .LBB9_9
+; LE-32BIT-NEXT:  .LBB9_8:
+; LE-32BIT-NEXT:    addi 9, 20, 0
+; LE-32BIT-NEXT:  .LBB9_9:
+; LE-32BIT-NEXT:    or 6, 0, 12
+; LE-32BIT-NEXT:    lwz 12, 52(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    srw 5, 4, 5
+; LE-32BIT-NEXT:    bc 12, 28, .LBB9_11
+; LE-32BIT-NEXT:  # %bb.10:
+; LE-32BIT-NEXT:    ori 0, 17, 0
+; LE-32BIT-NEXT:    b .LBB9_12
+; LE-32BIT-NEXT:  .LBB9_11:
+; LE-32BIT-NEXT:    li 0, 0
+; LE-32BIT-NEXT:  .LBB9_12:
+; LE-32BIT-NEXT:    or 5, 14, 5
+; LE-32BIT-NEXT:    stw 0, 4(18)
+; LE-32BIT-NEXT:    slw 21, 3, 16
+; LE-32BIT-NEXT:    cmplwi 7, 16, 64
+; LE-32BIT-NEXT:    cmplwi 3, 16, 0
+; LE-32BIT-NEXT:    slw 0, 7, 16
+; LE-32BIT-NEXT:    li 16, 0
+; LE-32BIT-NEXT:    bc 12, 4, .LBB9_14
+; LE-32BIT-NEXT:  # %bb.13:
+; LE-32BIT-NEXT:    ori 24, 19, 0
+; LE-32BIT-NEXT:    b .LBB9_15
+; LE-32BIT-NEXT:  .LBB9_14:
+; LE-32BIT-NEXT:    addi 24, 22, 0
+; LE-32BIT-NEXT:  .LBB9_15:
+; LE-32BIT-NEXT:    cmplwi 5, 30, 0
+; LE-32BIT-NEXT:    cmplwi 2, 31, 0
+; LE-32BIT-NEXT:    or 5, 0, 5
+; LE-32BIT-NEXT:    srw 17, 11, 12
+; LE-32BIT-NEXT:    bc 12, 28, .LBB9_17
+; LE-32BIT-NEXT:  # %bb.16:
+; LE-32BIT-NEXT:    ori 0, 16, 0
+; LE-32BIT-NEXT:    b .LBB9_18
+; LE-32BIT-NEXT:  .LBB9_17:
+; LE-32BIT-NEXT:    addi 0, 21, 0
+; LE-32BIT-NEXT:  .LBB9_18:
+; LE-32BIT-NEXT:    lwz 21, 60(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    slw 20, 3, 15
+; LE-32BIT-NEXT:    srw 19, 8, 12
+; LE-32BIT-NEXT:    bc 12, 10, .LBB9_19
+; LE-32BIT-NEXT:    b .LBB9_20
+; LE-32BIT-NEXT:  .LBB9_19:
+; LE-32BIT-NEXT:    addi 25, 3, 0
+; LE-32BIT-NEXT:  .LBB9_20:
+; LE-32BIT-NEXT:    bc 12, 22, .LBB9_22
+; LE-32BIT-NEXT:  # %bb.21:
+; LE-32BIT-NEXT:    ori 12, 24, 0
+; LE-32BIT-NEXT:    b .LBB9_23
+; LE-32BIT-NEXT:  .LBB9_22:
+; LE-32BIT-NEXT:    addi 12, 3, 0
+; LE-32BIT-NEXT:  .LBB9_23:
+; LE-32BIT-NEXT:    bc 12, 4, .LBB9_25
+; LE-32BIT-NEXT:  # %bb.24:
+; LE-32BIT-NEXT:    ori 3, 17, 0
+; LE-32BIT-NEXT:    b .LBB9_26
+; LE-32BIT-NEXT:  .LBB9_25:
+; LE-32BIT-NEXT:    addi 3, 6, 0
+; LE-32BIT-NEXT:  .LBB9_26:
+; LE-32BIT-NEXT:    lwz 6, 48(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    srw 30, 8, 30
+; LE-32BIT-NEXT:    srw 29, 11, 31
+; LE-32BIT-NEXT:    bc 12, 22, .LBB9_27
+; LE-32BIT-NEXT:    b .LBB9_28
+; LE-32BIT-NEXT:  .LBB9_27:
+; LE-32BIT-NEXT:    addi 9, 21, 0
+; LE-32BIT-NEXT:  .LBB9_28:
+; LE-32BIT-NEXT:    mr 22, 4
+; LE-32BIT-NEXT:    lwz 4, 56(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    or 9, 9, 0
+; LE-32BIT-NEXT:    bc 12, 4, .LBB9_30
+; LE-32BIT-NEXT:  # %bb.29:
+; LE-32BIT-NEXT:    ori 0, 16, 0
+; LE-32BIT-NEXT:    b .LBB9_31
+; LE-32BIT-NEXT:  .LBB9_30:
+; LE-32BIT-NEXT:    addi 0, 30, 0
+; LE-32BIT-NEXT:  .LBB9_31:
+; LE-32BIT-NEXT:    bc 12, 24, .LBB9_33
+; LE-32BIT-NEXT:  # %bb.32:
+; LE-32BIT-NEXT:    ori 30, 16, 0
+; LE-32BIT-NEXT:    b .LBB9_34
+; LE-32BIT-NEXT:  .LBB9_33:
+; LE-32BIT-NEXT:    addi 30, 29, 0
+; LE-32BIT-NEXT:  .LBB9_34:
+; LE-32BIT-NEXT:    bc 12, 4, .LBB9_36
+; LE-32BIT-NEXT:  # %bb.35:
+; LE-32BIT-NEXT:    ori 29, 16, 0
+; LE-32BIT-NEXT:    b .LBB9_37
+; LE-32BIT-NEXT:  .LBB9_36:
+; LE-32BIT-NEXT:    addi 29, 6, 0
+; LE-32BIT-NEXT:  .LBB9_37:
+; LE-32BIT-NEXT:    lwz 6, 44(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    mr 14, 18
+; LE-32BIT-NEXT:    srw 18, 11, 4
+; LE-32BIT-NEXT:    lwz 4, 20(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    bc 12, 24, .LBB9_39
+; LE-32BIT-NEXT:  # %bb.38:
+; LE-32BIT-NEXT:    ori 24, 16, 0
+; LE-32BIT-NEXT:    b .LBB9_40
+; LE-32BIT-NEXT:  .LBB9_39:
+; LE-32BIT-NEXT:    addi 24, 6, 0
+; LE-32BIT-NEXT:  .LBB9_40:
+; LE-32BIT-NEXT:    bc 12, 4, .LBB9_42
+; LE-32BIT-NEXT:  # %bb.41:
+; LE-32BIT-NEXT:    ori 26, 19, 0
+; LE-32BIT-NEXT:    b .LBB9_42
+; LE-32BIT-NEXT:  .LBB9_42:
+; LE-32BIT-NEXT:    bc 12, 22, .LBB9_43
+; LE-32BIT-NEXT:    b .LBB9_44
+; LE-32BIT-NEXT:  .LBB9_43:
+; LE-32BIT-NEXT:    addi 3, 22, 0
+; LE-32BIT-NEXT:  .LBB9_44:
+; LE-32BIT-NEXT:    bc 12, 28, .LBB9_46
+; LE-32BIT-NEXT:  # %bb.45:
+; LE-32BIT-NEXT:    ori 5, 20, 0
+; LE-32BIT-NEXT:    b .LBB9_46
+; LE-32BIT-NEXT:  .LBB9_46:
+; LE-32BIT-NEXT:    bc 12, 0, .LBB9_48
+; LE-32BIT-NEXT:  # %bb.47:
+; LE-32BIT-NEXT:    ori 9, 25, 0
+; LE-32BIT-NEXT:    b .LBB9_48
+; LE-32BIT-NEXT:  .LBB9_48:
+; LE-32BIT-NEXT:    bc 12, 24, .LBB9_50
+; LE-32BIT-NEXT:  # %bb.49:
+; LE-32BIT-NEXT:    ori 28, 18, 0
+; LE-32BIT-NEXT:    b .LBB9_50
+; LE-32BIT-NEXT:  .LBB9_50:
+; LE-32BIT-NEXT:    bc 12, 0, .LBB9_52
+; LE-32BIT-NEXT:  # %bb.51:
+; LE-32BIT-NEXT:    ori 12, 16, 0
+; LE-32BIT-NEXT:    b .LBB9_52
+; LE-32BIT-NEXT:  .LBB9_52:
+; LE-32BIT-NEXT:    lwz 6, 24(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    bc 12, 14, .LBB9_53
+; LE-32BIT-NEXT:    b .LBB9_54
+; LE-32BIT-NEXT:  .LBB9_53:
+; LE-32BIT-NEXT:    addi 5, 7, 0
+; LE-32BIT-NEXT:  .LBB9_54:
+; LE-32BIT-NEXT:    bc 12, 10, .LBB9_55
+; LE-32BIT-NEXT:    b .LBB9_56
+; LE-32BIT-NEXT:  .LBB9_55:
+; LE-32BIT-NEXT:    addi 28, 22, 0
+; LE-32BIT-NEXT:  .LBB9_56:
+; LE-32BIT-NEXT:    bc 12, 28, .LBB9_57
+; LE-32BIT-NEXT:    b .LBB9_58
+; LE-32BIT-NEXT:  .LBB9_57:
+; LE-32BIT-NEXT:    addi 4, 27, 0
+; LE-32BIT-NEXT:  .LBB9_58:
+; LE-32BIT-NEXT:    stw 12, 12(14)
+; LE-32BIT-NEXT:    bc 12, 14, .LBB9_59
+; LE-32BIT-NEXT:    b .LBB9_60
+; LE-32BIT-NEXT:  .LBB9_59:
+; LE-32BIT-NEXT:    addi 4, 11, 0
+; LE-32BIT-NEXT:  .LBB9_60:
+; LE-32BIT-NEXT:    bc 12, 28, .LBB9_62
+; LE-32BIT-NEXT:  # %bb.61:
+; LE-32BIT-NEXT:    ori 27, 16, 0
+; LE-32BIT-NEXT:    b .LBB9_63
+; LE-32BIT-NEXT:  .LBB9_62:
+; LE-32BIT-NEXT:    addi 27, 6, 0
+; LE-32BIT-NEXT:  .LBB9_63:
+; LE-32BIT-NEXT:    bc 12, 22, .LBB9_65
+; LE-32BIT-NEXT:  # %bb.64:
+; LE-32BIT-NEXT:    ori 6, 26, 0
+; LE-32BIT-NEXT:    b .LBB9_66
+; LE-32BIT-NEXT:  .LBB9_65:
+; LE-32BIT-NEXT:    addi 6, 10, 0
+; LE-32BIT-NEXT:  .LBB9_66:
+; LE-32BIT-NEXT:    li 26, 0
+; LE-32BIT-NEXT:    bc 12, 0, .LBB9_68
+; LE-32BIT-NEXT:  # %bb.67:
+; LE-32BIT-NEXT:    ori 3, 26, 0
+; LE-32BIT-NEXT:    b .LBB9_68
+; LE-32BIT-NEXT:  .LBB9_68:
+; LE-32BIT-NEXT:    or 6, 6, 27
+; LE-32BIT-NEXT:    stw 3, 8(14)
+; LE-32BIT-NEXT:    or 3, 0, 4
+; LE-32BIT-NEXT:    bc 12, 22, .LBB9_70
+; LE-32BIT-NEXT:  # %bb.69:
+; LE-32BIT-NEXT:    ori 4, 9, 0
+; LE-32BIT-NEXT:    b .LBB9_71
+; LE-32BIT-NEXT:  .LBB9_70:
+; LE-32BIT-NEXT:    addi 4, 21, 0
+; LE-32BIT-NEXT:  .LBB9_71:
+; LE-32BIT-NEXT:    bc 12, 0, .LBB9_73
+; LE-32BIT-NEXT:  # %bb.72:
+; LE-32BIT-NEXT:    ori 3, 30, 0
+; LE-32BIT-NEXT:    ori 6, 28, 0
+; LE-32BIT-NEXT:    b .LBB9_73
+; LE-32BIT-NEXT:  .LBB9_73:
+; LE-32BIT-NEXT:    stw 4, 28(14)
+; LE-32BIT-NEXT:    or 4, 29, 5
+; LE-32BIT-NEXT:    bc 12, 0, .LBB9_75
+; LE-32BIT-NEXT:  # %bb.74:
+; LE-32BIT-NEXT:    ori 4, 24, 0
+; LE-32BIT-NEXT:    b .LBB9_75
+; LE-32BIT-NEXT:  .LBB9_75:
+; LE-32BIT-NEXT:    bc 12, 22, .LBB9_77
+; LE-32BIT-NEXT:  # %bb.76:
+; LE-32BIT-NEXT:    ori 5, 6, 0
+; LE-32BIT-NEXT:    b .LBB9_78
+; LE-32BIT-NEXT:  .LBB9_77:
+; LE-32BIT-NEXT:    addi 3, 8, 0
+; LE-32BIT-NEXT:    addi 5, 10, 0
+; LE-32BIT-NEXT:  .LBB9_78:
+; LE-32BIT-NEXT:    stw 3, 16(14)
+; LE-32BIT-NEXT:    bc 12, 22, .LBB9_80
+; LE-32BIT-NEXT:  # %bb.79:
+; LE-32BIT-NEXT:    ori 3, 4, 0
+; LE-32BIT-NEXT:    b .LBB9_81
+; LE-32BIT-NEXT:  .LBB9_80:
+; LE-32BIT-NEXT:    addi 3, 23, 0
+; LE-32BIT-NEXT:  .LBB9_81:
+; LE-32BIT-NEXT:    stw 5, 24(14)
+; LE-32BIT-NEXT:    stw 3, 20(14)
+; LE-32BIT-NEXT:    lwz 12, 68(1)
+; LE-32BIT-NEXT:    lwz 31, 140(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    mtcrf 32, 12 # cr2
+; LE-32BIT-NEXT:    mtcrf 16, 12 # cr3
+; LE-32BIT-NEXT:    lwz 30, 136(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 29, 132(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 28, 128(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 27, 124(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 26, 120(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 25, 116(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 24, 112(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 23, 108(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 22, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 21, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 20, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 19, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 18, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 17, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 16, 80(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 15, 76(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 14, 72(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    addi 1, 1, 144
+; LE-32BIT-NEXT:    blr
+  %src = load i256, ptr %src.ptr, align 1
+  %bitOff = load i256, ptr %bitOff.ptr, align 1
+  %res = lshr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: shl_32bytes:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    lwz 4, 0(4)
+; LE-64BIT-NEXT:    ld 7, 24(3)
+; LE-64BIT-NEXT:    ld 8, 16(3)
+; LE-64BIT-NEXT:    ld 9, 8(3)
+; LE-64BIT-NEXT:    li 6, 0
+; LE-64BIT-NEXT:    ld 3, 0(3)
+; LE-64BIT-NEXT:    std 28, -32(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    std 21, -88(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    std 24, -64(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    subfic 28, 4, 64
+; LE-64BIT-NEXT:    std 25, -56(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    subfic 11, 4, 192
+; LE-64BIT-NEXT:    addi 0, 4, -128
+; LE-64BIT-NEXT:    subfic 25, 4, 128
+; LE-64BIT-NEXT:    std 27, -40(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    std 29, -24(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    sld 29, 9, 4
+; LE-64BIT-NEXT:    addi 27, 4, -64
+; LE-64BIT-NEXT:    std 22, -80(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    srd 24, 8, 28
+; LE-64BIT-NEXT:    srd 21, 9, 28
+; LE-64BIT-NEXT:    std 26, -48(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    srd 28, 3, 28
+; LE-64BIT-NEXT:    sld 10, 7, 4
+; LE-64BIT-NEXT:    std 30, -16(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    addi 30, 4, -192
+; LE-64BIT-NEXT:    srd 11, 3, 11
+; LE-64BIT-NEXT:    subfic 22, 25, 64
+; LE-64BIT-NEXT:    or 29, 29, 28
+; LE-64BIT-NEXT:    sld 26, 9, 0
+; LE-64BIT-NEXT:    sld 28, 3, 27
+; LE-64BIT-NEXT:    std 23, -72(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    or 10, 10, 24
+; LE-64BIT-NEXT:    ld 24, -64(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    sld 30, 3, 30
+; LE-64BIT-NEXT:    sld 23, 8, 27
+; LE-64BIT-NEXT:    or 11, 26, 11
+; LE-64BIT-NEXT:    or 29, 29, 28
+; LE-64BIT-NEXT:    sld 27, 9, 22
+; LE-64BIT-NEXT:    srd 28, 3, 25
+; LE-64BIT-NEXT:    or 10, 10, 23
+; LE-64BIT-NEXT:    or 11, 11, 30
+; LE-64BIT-NEXT:    ld 26, -48(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    srd 9, 9, 25
+; LE-64BIT-NEXT:    or 30, 28, 27
+; LE-64BIT-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    ld 25, -56(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    ld 23, -72(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    cmplwi 4, 128
+; LE-64BIT-NEXT:    sld 12, 8, 4
+; LE-64BIT-NEXT:    or 9, 10, 9
+; LE-64BIT-NEXT:    or 30, 30, 21
+; LE-64BIT-NEXT:    ld 22, -80(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    ld 21, -88(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    cmplwi 1, 4, 0
+; LE-64BIT-NEXT:    sld 10, 3, 0
+; LE-64BIT-NEXT:    isellt 9, 9, 11
+; LE-64BIT-NEXT:    or 11, 12, 30
+; LE-64BIT-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    isel 7, 7, 9, 6
+; LE-64BIT-NEXT:    sld 3, 3, 4
+; LE-64BIT-NEXT:    isellt 9, 11, 10
+; LE-64BIT-NEXT:    std 7, 24(5)
+; LE-64BIT-NEXT:    isellt 0, 29, 6
+; LE-64BIT-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    isel 4, 8, 9, 6
+; LE-64BIT-NEXT:    std 0, 8(5)
+; LE-64BIT-NEXT:    isellt 3, 3, 6
+; LE-64BIT-NEXT:    std 4, 16(5)
+; LE-64BIT-NEXT:    std 3, 0(5)
+; LE-64BIT-NEXT:    blr
+;
+; BE-LABEL: shl_32bytes:
+; BE:       # %bb.0:
+; BE-NEXT:    lwz 4, 28(4)
+; BE-NEXT:    ld 7, 0(3)
+; BE-NEXT:    ld 8, 8(3)
+; BE-NEXT:    ld 9, 16(3)
+; BE-NEXT:    ld 3, 24(3)
+; BE-NEXT:    std 27, -40(1) # 8-byte Folded Spill
+; BE-NEXT:    std 30, -16(1) # 8-byte Folded Spill
+; BE-NEXT:    std 28, -32(1) # 8-byte Folded Spill
+; BE-NEXT:    std 29, -24(1) # 8-byte Folded Spill
+; BE-NEXT:    li 6, 0
+; BE-NEXT:    subfic 10, 4, 192
+; BE-NEXT:    addi 11, 4, -128
+; BE-NEXT:    addi 12, 4, -192
+; BE-NEXT:    subfic 30, 4, 64
+; BE-NEXT:    srd 10, 3, 10
+; BE-NEXT:    sld 27, 9, 11
+; BE-NEXT:    sld 0, 7, 4
+; BE-NEXT:    addi 29, 4, -64
+; BE-NEXT:    subfic 28, 4, 128
+; BE-NEXT:    sld 12, 3, 12
+; BE-NEXT:    or 10, 27, 10
+; BE-NEXT:    srd 27, 8, 30
+; BE-NEXT:    or 10, 10, 12
+; BE-NEXT:    or 0, 0, 27
+; BE-NEXT:    sld 27, 8, 29
+; BE-NEXT:    subfic 12, 28, 64
+; BE-NEXT:    or 0, 0, 27
+; BE-NEXT:    srd 27, 3, 28
+; BE-NEXT:    sld 12, 9, 12
+; BE-NEXT:    srd 28, 9, 28
+; BE-NEXT:    cmplwi 4, 128
+; BE-NEXT:    or 12, 27, 12
+; BE-NEXT:    or 28, 0, 28
+; BE-NEXT:    srd 0, 9, 30
+; BE-NEXT:    sld 9, 9, 4
+; BE-NEXT:    sld 11, 3, 11
+; BE-NEXT:    cmplwi 1, 4, 0
+; BE-NEXT:    or 12, 12, 0
+; BE-NEXT:    sld 0, 8, 4
+; BE-NEXT:    bc 12, 0, .LBB10_1
+; BE-NEXT:    b .LBB10_2
+; BE-NEXT:  .LBB10_1:
+; BE-NEXT:    addi 10, 28, 0
+; BE-NEXT:  .LBB10_2:
+; BE-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
+; BE-NEXT:    or 12, 0, 12
+; BE-NEXT:    srd 0, 3, 30
+; BE-NEXT:    sld 30, 3, 29
+; BE-NEXT:    bc 12, 0, .LBB10_3
+; BE-NEXT:    b .LBB10_4
+; BE-NEXT:  .LBB10_3:
+; BE-NEXT:    addi 11, 12, 0
+; BE-NEXT:  .LBB10_4:
+; BE-NEXT:    sld 3, 3, 4
+; BE-NEXT:    bc 12, 6, .LBB10_6
+; BE-NEXT:  # %bb.5:
+; BE-NEXT:    ori 4, 10, 0
+; BE-NEXT:    b .LBB10_7
+; BE-NEXT:  .LBB10_6:
+; BE-NEXT:    addi 4, 7, 0
+; BE-NEXT:  .LBB10_7:
+; BE-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
+; BE-NEXT:    or 9, 9, 0
+; BE-NEXT:    or 9, 9, 30
+; BE-NEXT:    bc 12, 6, .LBB10_9
+; BE-NEXT:  # %bb.8:
+; BE-NEXT:    ori 7, 11, 0
+; BE-NEXT:    b .LBB10_10
+; BE-NEXT:  .LBB10_9:
+; BE-NEXT:    addi 7, 8, 0
+; BE-NEXT:  .LBB10_10:
+; BE-NEXT:    bc 12, 0, .LBB10_12
+; BE-NEXT:  # %bb.11:
+; BE-NEXT:    ori 8, 6, 0
+; BE-NEXT:    ori 3, 6, 0
+; BE-NEXT:    b .LBB10_13
+; BE-NEXT:  .LBB10_12:
+; BE-NEXT:    addi 8, 9, 0
+; BE-NEXT:  .LBB10_13:
+; BE-NEXT:    std 4, 0(5)
+; BE-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
+; BE-NEXT:    std 3, 24(5)
+; BE-NEXT:    std 8, 16(5)
+; BE-NEXT:    std 7, 8(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: shl_32bytes:
+; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    stwu 1, -144(1)
+; LE-32BIT-NEXT:    mfcr 12
+; LE-32BIT-NEXT:    stw 14, 72(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 15, 76(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 16, 80(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 17, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 18, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 19, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 20, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 21, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 22, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 23, 108(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 24, 112(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 25, 116(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 26, 120(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 27, 124(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 28, 128(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 29, 132(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 30, 136(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 31, 140(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 12, 68(1)
+; LE-32BIT-NEXT:    lwz 30, 28(4)
+; LE-32BIT-NEXT:    stw 5, 64(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    lwz 6, 24(3)
+; LE-32BIT-NEXT:    subfic 21, 30, 224
+; LE-32BIT-NEXT:    lwz 5, 28(3)
+; LE-32BIT-NEXT:    subfic 29, 30, 160
+; LE-32BIT-NEXT:    lwz 7, 4(3)
+; LE-32BIT-NEXT:    addi 4, 30, -128
+; LE-32BIT-NEXT:    lwz 9, 0(3)
+; LE-32BIT-NEXT:    subfic 28, 30, 96
+; LE-32BIT-NEXT:    lwz 10, 8(3)
+; LE-32BIT-NEXT:    addi 0, 30, -64
+; LE-32BIT-NEXT:    lwz 8, 12(3)
+; LE-32BIT-NEXT:    subfic 25, 30, 32
+; LE-32BIT-NEXT:    lwz 12, 16(3)
+; LE-32BIT-NEXT:    srw 21, 5, 21
+; LE-32BIT-NEXT:    lwz 11, 20(3)
+; LE-32BIT-NEXT:    addi 3, 30, -192
+; LE-32BIT-NEXT:    slw 16, 6, 3
+; LE-32BIT-NEXT:    stw 3, 56(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    slw 20, 9, 30
+; LE-32BIT-NEXT:    srw 15, 11, 29
+; LE-32BIT-NEXT:    slw 14, 12, 4
+; LE-32BIT-NEXT:    srw 31, 8, 28
+; LE-32BIT-NEXT:    slw 3, 10, 0
+; LE-32BIT-NEXT:    or 21, 16, 21
+; LE-32BIT-NEXT:    srw 16, 7, 25
+; LE-32BIT-NEXT:    slw 19, 10, 30
+; LE-32BIT-NEXT:    or 15, 14, 15
+; LE-32BIT-NEXT:    srw 14, 8, 25
+; LE-32BIT-NEXT:    or 3, 3, 31
+; LE-32BIT-NEXT:    srw 31, 5, 29
+; LE-32BIT-NEXT:    or 20, 20, 16
+; LE-32BIT-NEXT:    slw 16, 6, 4
+; LE-32BIT-NEXT:    addi 27, 30, -224
+; LE-32BIT-NEXT:    or 19, 19, 14
+; LE-32BIT-NEXT:    srw 14, 5, 28
+; LE-32BIT-NEXT:    or 16, 16, 31
+; LE-32BIT-NEXT:    slw 31, 6, 0
+; LE-32BIT-NEXT:    addi 23, 30, -160
+; LE-32BIT-NEXT:    slw 18, 12, 30
+; LE-32BIT-NEXT:    stw 0, 52(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    or 14, 31, 14
+; LE-32BIT-NEXT:    srw 31, 11, 25
+; LE-32BIT-NEXT:    slw 0, 5, 27
+; LE-32BIT-NEXT:    addi 26, 30, -96
+; LE-32BIT-NEXT:    slw 17, 6, 30
+; LE-32BIT-NEXT:    or 18, 18, 31
+; LE-32BIT-NEXT:    srw 31, 5, 25
+; LE-32BIT-NEXT:    or 21, 21, 0
+; LE-32BIT-NEXT:    slw 0, 11, 23
+; LE-32BIT-NEXT:    or 17, 17, 31
+; LE-32BIT-NEXT:    addi 31, 30, -32
+; LE-32BIT-NEXT:    or 0, 15, 0
+; LE-32BIT-NEXT:    slw 15, 8, 26
+; LE-32BIT-NEXT:    stw 29, 40(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    or 29, 3, 15
+; LE-32BIT-NEXT:    slw 15, 7, 31
+; LE-32BIT-NEXT:    or 20, 20, 15
+; LE-32BIT-NEXT:    slw 15, 8, 31
+; LE-32BIT-NEXT:    or 3, 19, 15
+; LE-32BIT-NEXT:    subfic 15, 30, 128
+; LE-32BIT-NEXT:    slw 23, 5, 23
+; LE-32BIT-NEXT:    stw 3, 48(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    or 3, 16, 23
+; LE-32BIT-NEXT:    subfic 16, 15, 32
+; LE-32BIT-NEXT:    stw 3, 44(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    srw 3, 11, 15
+; LE-32BIT-NEXT:    slw 22, 12, 16
+; LE-32BIT-NEXT:    or 23, 3, 22
+; LE-32BIT-NEXT:    subfic 22, 30, 64
+; LE-32BIT-NEXT:    stw 9, 60(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    mr 9, 10
+; LE-32BIT-NEXT:    subfic 3, 22, 32
+; LE-32BIT-NEXT:    stw 4, 36(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    srw 4, 8, 22
+; LE-32BIT-NEXT:    slw 24, 9, 3
+; LE-32BIT-NEXT:    or 4, 4, 24
+; LE-32BIT-NEXT:    subfic 24, 30, 192
+; LE-32BIT-NEXT:    subfic 27, 24, 32
+; LE-32BIT-NEXT:    mr 10, 26
+; LE-32BIT-NEXT:    slw 27, 6, 27
+; LE-32BIT-NEXT:    srw 26, 5, 24
+; LE-32BIT-NEXT:    stw 28, 24(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    or 27, 26, 27
+; LE-32BIT-NEXT:    srw 26, 11, 22
+; LE-32BIT-NEXT:    slw 28, 12, 3
+; LE-32BIT-NEXT:    or 28, 26, 28
+; LE-32BIT-NEXT:    srw 26, 5, 15
+; LE-32BIT-NEXT:    slw 19, 6, 16
+; LE-32BIT-NEXT:    or 26, 26, 19
+; LE-32BIT-NEXT:    slw 19, 5, 10
+; LE-32BIT-NEXT:    stw 7, 32(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    mr 7, 9
+; LE-32BIT-NEXT:    or 19, 14, 19
+; LE-32BIT-NEXT:    slw 14, 11, 31
+; LE-32BIT-NEXT:    lwz 9, 64(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    or 18, 18, 14
+; LE-32BIT-NEXT:    slw 3, 6, 3
+; LE-32BIT-NEXT:    srw 14, 5, 22
+; LE-32BIT-NEXT:    cmplwi 1, 30, 64
+; LE-32BIT-NEXT:    cmplwi 30, 128
+; LE-32BIT-NEXT:    srw 24, 6, 24
+; LE-32BIT-NEXT:    or 10, 14, 3
+; LE-32BIT-NEXT:    slw 14, 5, 31
+; LE-32BIT-NEXT:    crnand 28, 0, 4
+; LE-32BIT-NEXT:    slw 31, 5, 30
+; LE-32BIT-NEXT:    or 24, 0, 24
+; LE-32BIT-NEXT:    mr 3, 7
+; LE-32BIT-NEXT:    stw 7, 28(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    srw 0, 7, 22
+; LE-32BIT-NEXT:    lwz 7, 24(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    or 17, 17, 14
+; LE-32BIT-NEXT:    bc 12, 28, .LBB10_2
+; LE-32BIT-NEXT:  # %bb.1:
+; LE-32BIT-NEXT:    ori 14, 31, 0
+; LE-32BIT-NEXT:    b .LBB10_3
+; LE-32BIT-NEXT:  .LBB10_2:
+; LE-32BIT-NEXT:    li 14, 0
+; LE-32BIT-NEXT:  .LBB10_3:
+; LE-32BIT-NEXT:    or 20, 20, 0
+; LE-32BIT-NEXT:    subfic 0, 15, 64
+; LE-32BIT-NEXT:    stw 14, 28(9)
+; LE-32BIT-NEXT:    subfic 14, 0, 32
+; LE-32BIT-NEXT:    srw 14, 11, 14
+; LE-32BIT-NEXT:    slw 31, 12, 0
+; LE-32BIT-NEXT:    or 14, 31, 14
+; LE-32BIT-NEXT:    srw 31, 12, 7
+; LE-32BIT-NEXT:    or 23, 23, 31
+; LE-32BIT-NEXT:    srw 31, 3, 25
+; LE-32BIT-NEXT:    lwz 3, 40(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    or 4, 4, 31
+; LE-32BIT-NEXT:    slw 0, 11, 0
+; LE-32BIT-NEXT:    cmplwi 3, 15, 0
+; LE-32BIT-NEXT:    srw 31, 6, 3
+; LE-32BIT-NEXT:    or 27, 27, 31
+; LE-32BIT-NEXT:    srw 31, 12, 25
+; LE-32BIT-NEXT:    or 28, 28, 31
+; LE-32BIT-NEXT:    srw 31, 6, 7
+; LE-32BIT-NEXT:    or 26, 26, 31
+; LE-32BIT-NEXT:    srw 31, 6, 22
+; LE-32BIT-NEXT:    or 18, 18, 31
+; LE-32BIT-NEXT:    lwz 31, 36(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    srw 25, 6, 25
+; LE-32BIT-NEXT:    or 3, 10, 25
+; LE-32BIT-NEXT:    or 26, 26, 0
+; LE-32BIT-NEXT:    cmplwi 6, 31, 64
+; LE-32BIT-NEXT:    bc 12, 24, .LBB10_5
+; LE-32BIT-NEXT:  # %bb.4:
+; LE-32BIT-NEXT:    ori 25, 21, 0
+; LE-32BIT-NEXT:    b .LBB10_6
+; LE-32BIT-NEXT:  .LBB10_5:
+; LE-32BIT-NEXT:    addi 25, 24, 0
+; LE-32BIT-NEXT:  .LBB10_6:
+; LE-32BIT-NEXT:    slw 24, 11, 16
+; LE-32BIT-NEXT:    slw 0, 11, 30
+; LE-32BIT-NEXT:    or 24, 14, 24
+; LE-32BIT-NEXT:    lwz 14, 32(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    or 3, 0, 3
+; LE-32BIT-NEXT:    bc 12, 28, .LBB10_8
+; LE-32BIT-NEXT:  # %bb.7:
+; LE-32BIT-NEXT:    ori 0, 17, 0
+; LE-32BIT-NEXT:    b .LBB10_9
+; LE-32BIT-NEXT:  .LBB10_8:
+; LE-32BIT-NEXT:    li 0, 0
+; LE-32BIT-NEXT:  .LBB10_9:
+; LE-32BIT-NEXT:    bc 12, 4, .LBB10_11
+; LE-32BIT-NEXT:  # %bb.10:
+; LE-32BIT-NEXT:    ori 7, 29, 0
+; LE-32BIT-NEXT:    b .LBB10_12
+; LE-32BIT-NEXT:  .LBB10_11:
+; LE-32BIT-NEXT:    addi 7, 20, 0
+; LE-32BIT-NEXT:  .LBB10_12:
+; LE-32BIT-NEXT:    srw 20, 12, 15
+; LE-32BIT-NEXT:    stw 0, 24(9)
+; LE-32BIT-NEXT:    cmplwi 7, 15, 64
+; LE-32BIT-NEXT:    srw 0, 6, 15
+; LE-32BIT-NEXT:    li 15, 0
+; LE-32BIT-NEXT:    mr 16, 9
+; LE-32BIT-NEXT:    or 24, 0, 24
+; LE-32BIT-NEXT:    lwz 9, 52(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    bc 12, 28, .LBB10_14
+; LE-32BIT-NEXT:  # %bb.13:
+; LE-32BIT-NEXT:    ori 0, 15, 0
+; LE-32BIT-NEXT:    b .LBB10_15
+; LE-32BIT-NEXT:  .LBB10_14:
+; LE-32BIT-NEXT:    addi 0, 20, 0
+; LE-32BIT-NEXT:  .LBB10_15:
+; LE-32BIT-NEXT:    slw 21, 14, 30
+; LE-32BIT-NEXT:    lwz 20, 60(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    or 4, 21, 4
+; LE-32BIT-NEXT:    slw 21, 11, 31
+; LE-32BIT-NEXT:    cmplwi 5, 30, 0
+; LE-32BIT-NEXT:    or 27, 21, 27
+; LE-32BIT-NEXT:    bc 12, 4, .LBB10_17
+; LE-32BIT-NEXT:  # %bb.16:
+; LE-32BIT-NEXT:    ori 21, 19, 0
+; LE-32BIT-NEXT:    b .LBB10_18
+; LE-32BIT-NEXT:  .LBB10_17:
+; LE-32BIT-NEXT:    addi 21, 18, 0
+; LE-32BIT-NEXT:  .LBB10_18:
+; LE-32BIT-NEXT:    slw 19, 8, 9
+; LE-32BIT-NEXT:    slw 17, 5, 9
+; LE-32BIT-NEXT:    bc 12, 22, .LBB10_20
+; LE-32BIT-NEXT:  # %bb.19:
+; LE-32BIT-NEXT:    ori 9, 7, 0
+; LE-32BIT-NEXT:    b .LBB10_21
+; LE-32BIT-NEXT:  .LBB10_20:
+; LE-32BIT-NEXT:    addi 9, 20, 0
+; LE-32BIT-NEXT:  .LBB10_21:
+; LE-32BIT-NEXT:    lwz 7, 48(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    slw 30, 8, 30
+; LE-32BIT-NEXT:    lwz 10, 56(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    slw 29, 5, 31
+; LE-32BIT-NEXT:    or 9, 9, 0
+; LE-32BIT-NEXT:    bc 12, 4, .LBB10_23
+; LE-32BIT-NEXT:  # %bb.22:
+; LE-32BIT-NEXT:    ori 0, 15, 0
+; LE-32BIT-NEXT:    b .LBB10_24
+; LE-32BIT-NEXT:  .LBB10_23:
+; LE-32BIT-NEXT:    addi 0, 30, 0
+; LE-32BIT-NEXT:  .LBB10_24:
+; LE-32BIT-NEXT:    bc 12, 24, .LBB10_26
+; LE-32BIT-NEXT:  # %bb.25:
+; LE-32BIT-NEXT:    ori 30, 15, 0
+; LE-32BIT-NEXT:    b .LBB10_27
+; LE-32BIT-NEXT:  .LBB10_26:
+; LE-32BIT-NEXT:    addi 30, 29, 0
+; LE-32BIT-NEXT:  .LBB10_27:
+; LE-32BIT-NEXT:    bc 12, 4, .LBB10_29
+; LE-32BIT-NEXT:  # %bb.28:
+; LE-32BIT-NEXT:    ori 29, 15, 0
+; LE-32BIT-NEXT:    b .LBB10_30
+; LE-32BIT-NEXT:  .LBB10_29:
+; LE-32BIT-NEXT:    addi 29, 7, 0
+; LE-32BIT-NEXT:  .LBB10_30:
+; LE-32BIT-NEXT:    lwz 7, 44(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    bc 12, 28, .LBB10_31
+; LE-32BIT-NEXT:    b .LBB10_32
+; LE-32BIT-NEXT:  .LBB10_31:
+; LE-32BIT-NEXT:    addi 28, 26, 0
+; LE-32BIT-NEXT:  .LBB10_32:
+; LE-32BIT-NEXT:    bc 12, 4, .LBB10_34
+; LE-32BIT-NEXT:  # %bb.33:
+; LE-32BIT-NEXT:    ori 3, 17, 0
+; LE-32BIT-NEXT:    b .LBB10_34
+; LE-32BIT-NEXT:  .LBB10_34:
+; LE-32BIT-NEXT:    srw 22, 12, 22
+; LE-32BIT-NEXT:    slw 18, 5, 10
+; LE-32BIT-NEXT:    bc 12, 4, .LBB10_36
+; LE-32BIT-NEXT:  # %bb.35:
+; LE-32BIT-NEXT:    ori 4, 19, 0
+; LE-32BIT-NEXT:    b .LBB10_36
+; LE-32BIT-NEXT:  .LBB10_36:
+; LE-32BIT-NEXT:    bc 12, 14, .LBB10_38
+; LE-32BIT-NEXT:  # %bb.37:
+; LE-32BIT-NEXT:    ori 5, 28, 0
+; LE-32BIT-NEXT:    b .LBB10_38
+; LE-32BIT-NEXT:  .LBB10_38:
+; LE-32BIT-NEXT:    li 28, 0
+; LE-32BIT-NEXT:    bc 12, 22, .LBB10_39
+; LE-32BIT-NEXT:    b .LBB10_40
+; LE-32BIT-NEXT:  .LBB10_39:
+; LE-32BIT-NEXT:    addi 3, 11, 0
+; LE-32BIT-NEXT:  .LBB10_40:
+; LE-32BIT-NEXT:    cmplwi 2, 31, 0
+; LE-32BIT-NEXT:    bc 12, 24, .LBB10_42
+; LE-32BIT-NEXT:  # %bb.41:
+; LE-32BIT-NEXT:    ori 27, 18, 0
+; LE-32BIT-NEXT:    b .LBB10_42
+; LE-32BIT-NEXT:  .LBB10_42:
+; LE-32BIT-NEXT:    bc 12, 28, .LBB10_44
+; LE-32BIT-NEXT:  # %bb.43:
+; LE-32BIT-NEXT:    ori 26, 22, 0
+; LE-32BIT-NEXT:    b .LBB10_45
+; LE-32BIT-NEXT:  .LBB10_44:
+; LE-32BIT-NEXT:    addi 26, 24, 0
+; LE-32BIT-NEXT:  .LBB10_45:
+; LE-32BIT-NEXT:    bc 12, 24, .LBB10_47
+; LE-32BIT-NEXT:  # %bb.46:
+; LE-32BIT-NEXT:    ori 24, 15, 0
+; LE-32BIT-NEXT:    b .LBB10_48
+; LE-32BIT-NEXT:  .LBB10_47:
+; LE-32BIT-NEXT:    addi 24, 7, 0
+; LE-32BIT-NEXT:  .LBB10_48:
+; LE-32BIT-NEXT:    bc 12, 28, .LBB10_50
+; LE-32BIT-NEXT:  # %bb.49:
+; LE-32BIT-NEXT:    ori 7, 15, 0
+; LE-32BIT-NEXT:    b .LBB10_51
+; LE-32BIT-NEXT:  .LBB10_50:
+; LE-32BIT-NEXT:    addi 7, 23, 0
+; LE-32BIT-NEXT:  .LBB10_51:
+; LE-32BIT-NEXT:    bc 12, 22, .LBB10_52
+; LE-32BIT-NEXT:    b .LBB10_53
+; LE-32BIT-NEXT:  .LBB10_52:
+; LE-32BIT-NEXT:    addi 4, 14, 0
+; LE-32BIT-NEXT:  .LBB10_53:
+; LE-32BIT-NEXT:    bc 12, 0, .LBB10_55
+; LE-32BIT-NEXT:  # %bb.54:
+; LE-32BIT-NEXT:    ori 3, 28, 0
+; LE-32BIT-NEXT:    b .LBB10_55
+; LE-32BIT-NEXT:  .LBB10_55:
+; LE-32BIT-NEXT:    bc 12, 10, .LBB10_56
+; LE-32BIT-NEXT:    b .LBB10_57
+; LE-32BIT-NEXT:  .LBB10_56:
+; LE-32BIT-NEXT:    addi 25, 12, 0
+; LE-32BIT-NEXT:  .LBB10_57:
+; LE-32BIT-NEXT:    or 5, 0, 5
+; LE-32BIT-NEXT:    bc 12, 10, .LBB10_58
+; LE-32BIT-NEXT:    b .LBB10_59
+; LE-32BIT-NEXT:  .LBB10_58:
+; LE-32BIT-NEXT:    addi 27, 11, 0
+; LE-32BIT-NEXT:  .LBB10_59:
+; LE-32BIT-NEXT:    stw 3, 20(16)
+; LE-32BIT-NEXT:    or 3, 4, 7
+; LE-32BIT-NEXT:    bc 12, 0, .LBB10_61
+; LE-32BIT-NEXT:  # %bb.60:
+; LE-32BIT-NEXT:    ori 3, 27, 0
+; LE-32BIT-NEXT:    ori 9, 25, 0
+; LE-32BIT-NEXT:    b .LBB10_61
+; LE-32BIT-NEXT:  .LBB10_61:
+; LE-32BIT-NEXT:    bc 12, 14, .LBB10_63
+; LE-32BIT-NEXT:  # %bb.62:
+; LE-32BIT-NEXT:    ori 6, 26, 0
+; LE-32BIT-NEXT:    b .LBB10_63
+; LE-32BIT-NEXT:  .LBB10_63:
+; LE-32BIT-NEXT:    bc 12, 22, .LBB10_65
+; LE-32BIT-NEXT:  # %bb.64:
+; LE-32BIT-NEXT:    ori 12, 21, 0
+; LE-32BIT-NEXT:    b .LBB10_65
+; LE-32BIT-NEXT:  .LBB10_65:
+; LE-32BIT-NEXT:    bc 12, 0, .LBB10_67
+; LE-32BIT-NEXT:  # %bb.66:
+; LE-32BIT-NEXT:    ori 5, 30, 0
+; LE-32BIT-NEXT:    b .LBB10_67
+; LE-32BIT-NEXT:  .LBB10_67:
+; LE-32BIT-NEXT:    bc 12, 22, .LBB10_69
+; LE-32BIT-NEXT:  # %bb.68:
+; LE-32BIT-NEXT:    ori 4, 9, 0
+; LE-32BIT-NEXT:    b .LBB10_70
+; LE-32BIT-NEXT:  .LBB10_69:
+; LE-32BIT-NEXT:    addi 3, 14, 0
+; LE-32BIT-NEXT:    addi 4, 20, 0
+; LE-32BIT-NEXT:  .LBB10_70:
+; LE-32BIT-NEXT:    bc 12, 0, .LBB10_72
+; LE-32BIT-NEXT:  # %bb.71:
+; LE-32BIT-NEXT:    ori 12, 15, 0
+; LE-32BIT-NEXT:    b .LBB10_72
+; LE-32BIT-NEXT:  .LBB10_72:
+; LE-32BIT-NEXT:    bc 12, 22, .LBB10_73
+; LE-32BIT-NEXT:    b .LBB10_74
+; LE-32BIT-NEXT:  .LBB10_73:
+; LE-32BIT-NEXT:    addi 5, 8, 0
+; LE-32BIT-NEXT:  .LBB10_74:
+; LE-32BIT-NEXT:    stw 3, 4(16)
+; LE-32BIT-NEXT:    lwz 3, 28(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    stw 4, 0(16)
+; LE-32BIT-NEXT:    or 4, 29, 6
+; LE-32BIT-NEXT:    bc 12, 0, .LBB10_76
+; LE-32BIT-NEXT:  # %bb.75:
+; LE-32BIT-NEXT:    ori 4, 24, 0
+; LE-32BIT-NEXT:    b .LBB10_76
+; LE-32BIT-NEXT:  .LBB10_76:
+; LE-32BIT-NEXT:    stw 12, 16(16)
+; LE-32BIT-NEXT:    bc 12, 22, .LBB10_78
+; LE-32BIT-NEXT:  # %bb.77:
+; LE-32BIT-NEXT:    ori 3, 4, 0
+; LE-32BIT-NEXT:    b .LBB10_78
+; LE-32BIT-NEXT:  .LBB10_78:
+; LE-32BIT-NEXT:    stw 5, 12(16)
+; LE-32BIT-NEXT:    stw 3, 8(16)
+; LE-32BIT-NEXT:    lwz 12, 68(1)
+; LE-32BIT-NEXT:    lwz 31, 140(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    mtcrf 32, 12 # cr2
+; LE-32BIT-NEXT:    mtcrf 16, 12 # cr3
+; LE-32BIT-NEXT:    lwz 30, 136(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 29, 132(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 28, 128(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 27, 124(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 26, 120(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 25, 116(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 24, 112(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 23, 108(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 22, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 21, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 20, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 19, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 18, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 17, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 16, 80(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 15, 76(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 14, 72(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    addi 1, 1, 144
+; LE-32BIT-NEXT:    blr
+  %src = load i256, ptr %src.ptr, align 1
+  %bitOff = load i256, ptr %bitOff.ptr, align 1
+  %res = shl i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: ashr_32bytes:
+; LE-64BIT:       # %bb.0:
+; LE-64BIT-NEXT:    lwz 4, 0(4)
+; LE-64BIT-NEXT:    ld 7, 16(3)
+; LE-64BIT-NEXT:    ld 8, 24(3)
+; LE-64BIT-NEXT:    std 29, -24(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    std 27, -40(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    std 28, -32(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    std 30, -16(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    subfic 9, 4, 192
+; LE-64BIT-NEXT:    addi 10, 4, -128
+; LE-64BIT-NEXT:    addi 0, 4, -192
+; LE-64BIT-NEXT:    subfic 29, 4, 64
+; LE-64BIT-NEXT:    ld 6, 0(3)
+; LE-64BIT-NEXT:    srd 12, 7, 4
+; LE-64BIT-NEXT:    sld 9, 8, 9
+; LE-64BIT-NEXT:    addi 28, 4, -64
+; LE-64BIT-NEXT:    ld 3, 8(3)
+; LE-64BIT-NEXT:    std 26, -48(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    std 25, -56(1) # 8-byte Folded Spill
+; LE-64BIT-NEXT:    srd 30, 7, 10
+; LE-64BIT-NEXT:    srad 27, 8, 0
+; LE-64BIT-NEXT:    cmpwi 0, 1
+; LE-64BIT-NEXT:    sld 0, 8, 29
+; LE-64BIT-NEXT:    or 9, 30, 9
+; LE-64BIT-NEXT:    subfic 30, 4, 128
+; LE-64BIT-NEXT:    srad 26, 8, 28
+; LE-64BIT-NEXT:    cmpwi 1, 28, 1
+; LE-64BIT-NEXT:    or 12, 12, 0
+; LE-64BIT-NEXT:    subfic 25, 30, 64
+; LE-64BIT-NEXT:    srd 11, 6, 4
+; LE-64BIT-NEXT:    isel 12, 12, 26, 4
+; LE-64BIT-NEXT:    sld 26, 3, 29
+; LE-64BIT-NEXT:    srd 28, 3, 28
+; LE-64BIT-NEXT:    or 11, 11, 26
+; LE-64BIT-NEXT:    sld 29, 7, 29
+; LE-64BIT-NEXT:    srd 26, 7, 25
+; LE-64BIT-NEXT:    sld 7, 7, 30
+; LE-64BIT-NEXT:    or 11, 11, 28
+; LE-64BIT-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    ld 25, -56(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    sld 30, 8, 30
+; LE-64BIT-NEXT:    isellt 9, 9, 27
+; LE-64BIT-NEXT:    or 7, 11, 7
+; LE-64BIT-NEXT:    cmplwi 4, 128
+; LE-64BIT-NEXT:    sradi 27, 8, 63
+; LE-64BIT-NEXT:    or 30, 30, 26
+; LE-64BIT-NEXT:    ld 26, -48(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    srd 0, 3, 4
+; LE-64BIT-NEXT:    isellt 11, 12, 27
+; LE-64BIT-NEXT:    or 12, 30, 29
+; LE-64BIT-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    cmplwi 1, 4, 0
+; LE-64BIT-NEXT:    srad 10, 8, 10
+; LE-64BIT-NEXT:    std 11, 16(5)
+; LE-64BIT-NEXT:    isellt 7, 7, 9
+; LE-64BIT-NEXT:    or 9, 0, 12
+; LE-64BIT-NEXT:    isel 6, 6, 7, 6
+; LE-64BIT-NEXT:    srad 4, 8, 4
+; LE-64BIT-NEXT:    isellt 7, 9, 10
+; LE-64BIT-NEXT:    std 6, 0(5)
+; LE-64BIT-NEXT:    isel 3, 3, 7, 6
+; LE-64BIT-NEXT:    isellt 4, 4, 27
+; LE-64BIT-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
+; LE-64BIT-NEXT:    std 3, 8(5)
+; LE-64BIT-NEXT:    std 4, 24(5)
+; LE-64BIT-NEXT:    blr
+;
+; BE-LABEL: ashr_32bytes:
+; BE:       # %bb.0:
+; BE-NEXT:    lwz 4, 28(4)
+; BE-NEXT:    ld 6, 24(3)
+; BE-NEXT:    ld 7, 16(3)
+; BE-NEXT:    ld 8, 8(3)
+; BE-NEXT:    ld 3, 0(3)
+; BE-NEXT:    std 27, -40(1) # 8-byte Folded Spill
+; BE-NEXT:    std 29, -24(1) # 8-byte Folded Spill
+; BE-NEXT:    std 28, -32(1) # 8-byte Folded Spill
+; BE-NEXT:    std 30, -16(1) # 8-byte Folded Spill
+; BE-NEXT:    subfic 9, 4, 192
+; BE-NEXT:    addi 10, 4, -128
+; BE-NEXT:    addi 11, 4, -192
+; BE-NEXT:    subfic 0, 4, 64
+; BE-NEXT:    sld 9, 3, 9
+; BE-NEXT:    srd 27, 8, 10
+; BE-NEXT:    srd 12, 6, 4
+; BE-NEXT:    subfic 29, 4, 128
+; BE-NEXT:    cmpwi 11, 1
+; BE-NEXT:    srad 11, 3, 11
+; BE-NEXT:    or 9, 27, 9
+; BE-NEXT:    sld 27, 7, 0
+; BE-NEXT:    addi 30, 4, -64
+; BE-NEXT:    srd 28, 8, 4
+; BE-NEXT:    or 12, 12, 27
+; BE-NEXT:    sld 27, 3, 0
+; BE-NEXT:    bc 12, 0, .LBB11_2
+; BE-NEXT:  # %bb.1:
+; BE-NEXT:    ori 9, 11, 0
+; BE-NEXT:    b .LBB11_2
+; BE-NEXT:  .LBB11_2:
+; BE-NEXT:    subfic 11, 29, 64
+; BE-NEXT:    or 28, 28, 27
+; BE-NEXT:    srd 27, 7, 30
+; BE-NEXT:    sld 0, 8, 0
+; BE-NEXT:    srd 11, 8, 11
+; BE-NEXT:    sld 8, 8, 29
+; BE-NEXT:    sld 29, 3, 29
+; BE-NEXT:    cmplwi 4, 128
+; BE-NEXT:    or 12, 12, 27
+; BE-NEXT:    or 11, 29, 11
+; BE-NEXT:    or 8, 12, 8
+; BE-NEXT:    srd 12, 7, 4
+; BE-NEXT:    or 11, 11, 0
+; BE-NEXT:    cmpwi 1, 30, 1
+; BE-NEXT:    srad 30, 3, 30
+; BE-NEXT:    bc 12, 0, .LBB11_4
+; BE-NEXT:  # %bb.3:
+; BE-NEXT:    ori 8, 9, 0
+; BE-NEXT:    b .LBB11_4
+; BE-NEXT:  .LBB11_4:
+; BE-NEXT:    or 9, 12, 11
+; BE-NEXT:    srad 10, 3, 10
+; BE-NEXT:    bc 12, 4, .LBB11_6
+; BE-NEXT:  # %bb.5:
+; BE-NEXT:    ori 11, 30, 0
+; BE-NEXT:    b .LBB11_7
+; BE-NEXT:  .LBB11_6:
+; BE-NEXT:    addi 11, 28, 0
+; BE-NEXT:  .LBB11_7:
+; BE-NEXT:    cmplwi 1, 4, 0
+; BE-NEXT:    bc 12, 0, .LBB11_9
+; BE-NEXT:  # %bb.8:
+; BE-NEXT:    ori 9, 10, 0
+; BE-NEXT:    b .LBB11_9
+; BE-NEXT:  .LBB11_9:
+; BE-NEXT:    sradi 10, 3, 63
+; BE-NEXT:    srad 3, 3, 4
+; BE-NEXT:    bc 12, 6, .LBB11_11
+; BE-NEXT:  # %bb.10:
+; BE-NEXT:    ori 4, 8, 0
+; BE-NEXT:    b .LBB11_12
+; BE-NEXT:  .LBB11_11:
+; BE-NEXT:    addi 4, 6, 0
+; BE-NEXT:  .LBB11_12:
+; BE-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
+; BE-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
+; BE-NEXT:    bc 12, 6, .LBB11_14
+; BE-NEXT:  # %bb.13:
+; BE-NEXT:    ori 6, 9, 0
+; BE-NEXT:    b .LBB11_15
+; BE-NEXT:  .LBB11_14:
+; BE-NEXT:    addi 6, 7, 0
+; BE-NEXT:  .LBB11_15:
+; BE-NEXT:    bc 12, 0, .LBB11_17
+; BE-NEXT:  # %bb.16:
+; BE-NEXT:    ori 7, 10, 0
+; BE-NEXT:    ori 3, 10, 0
+; BE-NEXT:    b .LBB11_18
+; BE-NEXT:  .LBB11_17:
+; BE-NEXT:    addi 7, 11, 0
+; BE-NEXT:  .LBB11_18:
+; BE-NEXT:    std 4, 24(5)
+; BE-NEXT:    std 3, 0(5)
+; BE-NEXT:    std 7, 8(5)
+; BE-NEXT:    std 6, 16(5)
+; BE-NEXT:    blr
+;
+; LE-32BIT-LABEL: ashr_32bytes:
+; LE-32BIT:       # %bb.0:
+; LE-32BIT-NEXT:    stwu 1, -160(1)
+; LE-32BIT-NEXT:    mfcr 12
+; LE-32BIT-NEXT:    stw 14, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 15, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 16, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 17, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 18, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 19, 108(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 20, 112(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 21, 116(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 22, 120(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 23, 124(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 24, 128(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 25, 132(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 26, 136(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 27, 140(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 28, 144(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 29, 148(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 30, 152(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 31, 156(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    stw 12, 84(1)
+; LE-32BIT-NEXT:    lwz 30, 28(4)
+; LE-32BIT-NEXT:    lwz 10, 4(3)
+; LE-32BIT-NEXT:    lwz 6, 0(3)
+; LE-32BIT-NEXT:    subfic 23, 30, 224
+; LE-32BIT-NEXT:    stw 5, 80(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    addi 21, 30, -224
+; LE-32BIT-NEXT:    lwz 5, 24(3)
+; LE-32BIT-NEXT:    subfic 4, 30, 160
+; LE-32BIT-NEXT:    lwz 8, 28(3)
+; LE-32BIT-NEXT:    addi 0, 30, -128
+; LE-32BIT-NEXT:    lwz 12, 20(3)
+; LE-32BIT-NEXT:    subfic 28, 30, 96
+; LE-32BIT-NEXT:    lwz 9, 16(3)
+; LE-32BIT-NEXT:    addi 29, 30, -64
+; LE-32BIT-NEXT:    lwz 27, 12(3)
+; LE-32BIT-NEXT:    subfic 25, 30, 32
+; LE-32BIT-NEXT:    lwz 11, 8(3)
+; LE-32BIT-NEXT:    addi 3, 30, -192
+; LE-32BIT-NEXT:    slw 23, 6, 23
+; LE-32BIT-NEXT:    srw 16, 10, 3
+; LE-32BIT-NEXT:    stw 3, 72(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    srw 20, 8, 30
+; LE-32BIT-NEXT:    sraw 15, 6, 21
+; LE-32BIT-NEXT:    cmpwi 21, 1
+; LE-32BIT-NEXT:    slw 21, 11, 4
+; LE-32BIT-NEXT:    srw 14, 27, 0
+; LE-32BIT-NEXT:    slw 31, 9, 28
+; LE-32BIT-NEXT:    srw 3, 12, 29
+; LE-32BIT-NEXT:    or 23, 16, 23
+; LE-32BIT-NEXT:    slw 16, 5, 25
+; LE-32BIT-NEXT:    srw 19, 12, 30
+; LE-32BIT-NEXT:    or 21, 14, 21
+; LE-32BIT-NEXT:    slw 14, 9, 25
+; LE-32BIT-NEXT:    or 3, 3, 31
+; LE-32BIT-NEXT:    slw 31, 6, 4
+; LE-32BIT-NEXT:    or 20, 20, 16
+; LE-32BIT-NEXT:    srw 16, 10, 0
+; LE-32BIT-NEXT:    or 19, 19, 14
+; LE-32BIT-NEXT:    slw 14, 6, 28
+; LE-32BIT-NEXT:    or 16, 16, 31
+; LE-32BIT-NEXT:    srw 31, 10, 29
+; LE-32BIT-NEXT:    addi 24, 30, -160
+; LE-32BIT-NEXT:    srw 18, 27, 30
+; LE-32BIT-NEXT:    or 14, 31, 14
+; LE-32BIT-NEXT:    slw 31, 11, 25
+; LE-32BIT-NEXT:    addi 7, 30, -96
+; LE-32BIT-NEXT:    srw 17, 10, 30
+; LE-32BIT-NEXT:    stw 4, 48(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    or 18, 18, 31
+; LE-32BIT-NEXT:    slw 31, 6, 25
+; LE-32BIT-NEXT:    bc 12, 0, .LBB11_2
+; LE-32BIT-NEXT:  # %bb.1:
+; LE-32BIT-NEXT:    ori 4, 15, 0
+; LE-32BIT-NEXT:    b .LBB11_3
+; LE-32BIT-NEXT:  .LBB11_2:
+; LE-32BIT-NEXT:    addi 4, 23, 0
+; LE-32BIT-NEXT:  .LBB11_3:
+; LE-32BIT-NEXT:    srw 15, 11, 24
+; LE-32BIT-NEXT:    or 17, 17, 31
+; LE-32BIT-NEXT:    addi 31, 30, -32
+; LE-32BIT-NEXT:    or 21, 21, 15
+; LE-32BIT-NEXT:    srw 15, 9, 7
+; LE-32BIT-NEXT:    or 3, 3, 15
+; LE-32BIT-NEXT:    srw 15, 5, 31
+; LE-32BIT-NEXT:    or 20, 20, 15
+; LE-32BIT-NEXT:    srw 15, 9, 31
+; LE-32BIT-NEXT:    stw 3, 44(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    or 3, 19, 15
+; LE-32BIT-NEXT:    subfic 15, 30, 64
+; LE-32BIT-NEXT:    stw 4, 36(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    cmpwi 24, 1
+; LE-32BIT-NEXT:    sraw 24, 6, 24
+; LE-32BIT-NEXT:    subfic 4, 15, 32
+; LE-32BIT-NEXT:    stw 0, 56(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    srw 0, 27, 4
+; LE-32BIT-NEXT:    stw 3, 64(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    bc 12, 0, .LBB11_5
+; LE-32BIT-NEXT:  # %bb.4:
+; LE-32BIT-NEXT:    ori 3, 24, 0
+; LE-32BIT-NEXT:    b .LBB11_6
+; LE-32BIT-NEXT:  .LBB11_5:
+; LE-32BIT-NEXT:    addi 3, 16, 0
+; LE-32BIT-NEXT:  .LBB11_6:
+; LE-32BIT-NEXT:    slw 16, 11, 15
+; LE-32BIT-NEXT:    or 0, 16, 0
+; LE-32BIT-NEXT:    subfic 16, 30, 128
+; LE-32BIT-NEXT:    stw 5, 52(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    subfic 5, 16, 32
+; LE-32BIT-NEXT:    stw 3, 60(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    slw 3, 6, 16
+; LE-32BIT-NEXT:    srw 22, 10, 5
+; LE-32BIT-NEXT:    stw 29, 68(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    or 29, 3, 22
+; LE-32BIT-NEXT:    subfic 3, 30, 192
+; LE-32BIT-NEXT:    stw 8, 76(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    mr 8, 12
+; LE-32BIT-NEXT:    mr 23, 9
+; LE-32BIT-NEXT:    mr 9, 27
+; LE-32BIT-NEXT:    slw 22, 11, 16
+; LE-32BIT-NEXT:    srw 27, 27, 5
+; LE-32BIT-NEXT:    subfic 19, 3, 32
+; LE-32BIT-NEXT:    mr 12, 28
+; LE-32BIT-NEXT:    or 27, 22, 27
+; LE-32BIT-NEXT:    slw 22, 23, 15
+; LE-32BIT-NEXT:    srw 26, 8, 4
+; LE-32BIT-NEXT:    srw 19, 10, 19
+; LE-32BIT-NEXT:    slw 24, 6, 3
+; LE-32BIT-NEXT:    srw 4, 10, 4
+; LE-32BIT-NEXT:    slw 28, 6, 15
+; LE-32BIT-NEXT:    or 26, 22, 26
+; LE-32BIT-NEXT:    cmpwi 7, 1
+; LE-32BIT-NEXT:    sraw 22, 6, 7
+; LE-32BIT-NEXT:    or 24, 24, 19
+; LE-32BIT-NEXT:    srw 19, 11, 31
+; LE-32BIT-NEXT:    mr 7, 11
+; LE-32BIT-NEXT:    or 11, 28, 4
+; LE-32BIT-NEXT:    lwz 4, 80(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    bc 12, 0, .LBB11_7
+; LE-32BIT-NEXT:    b .LBB11_8
+; LE-32BIT-NEXT:  .LBB11_7:
+; LE-32BIT-NEXT:    addi 22, 14, 0
+; LE-32BIT-NEXT:  .LBB11_8:
+; LE-32BIT-NEXT:    cmplwi 1, 30, 64
+; LE-32BIT-NEXT:    cmplwi 30, 128
+; LE-32BIT-NEXT:    slw 3, 10, 3
+; LE-32BIT-NEXT:    or 19, 18, 19
+; LE-32BIT-NEXT:    cmpwi 5, 31, 1
+; LE-32BIT-NEXT:    sraw 18, 6, 31
+; LE-32BIT-NEXT:    crand 28, 0, 4
+; LE-32BIT-NEXT:    srawi 14, 6, 31
+; LE-32BIT-NEXT:    sraw 31, 6, 30
+; LE-32BIT-NEXT:    or 3, 21, 3
+; LE-32BIT-NEXT:    slw 21, 8, 15
+; LE-32BIT-NEXT:    bc 12, 20, .LBB11_10
+; LE-32BIT-NEXT:  # %bb.9:
+; LE-32BIT-NEXT:    ori 28, 18, 0
+; LE-32BIT-NEXT:    b .LBB11_11
+; LE-32BIT-NEXT:  .LBB11_10:
+; LE-32BIT-NEXT:    addi 28, 17, 0
+; LE-32BIT-NEXT:  .LBB11_11:
+; LE-32BIT-NEXT:    bc 12, 28, .LBB11_13
+; LE-32BIT-NEXT:  # %bb.12:
+; LE-32BIT-NEXT:    ori 18, 14, 0
+; LE-32BIT-NEXT:    b .LBB11_14
+; LE-32BIT-NEXT:  .LBB11_13:
+; LE-32BIT-NEXT:    addi 18, 31, 0
+; LE-32BIT-NEXT:  .LBB11_14:
+; LE-32BIT-NEXT:    or 21, 20, 21
+; LE-32BIT-NEXT:    subfic 20, 16, 64
+; LE-32BIT-NEXT:    stw 18, 0(4)
+; LE-32BIT-NEXT:    subfic 18, 20, 32
+; LE-32BIT-NEXT:    slw 18, 7, 18
+; LE-32BIT-NEXT:    srw 17, 9, 20
+; LE-32BIT-NEXT:    or 18, 17, 18
+; LE-32BIT-NEXT:    slw 17, 9, 25
+; LE-32BIT-NEXT:    mr 31, 8
+; LE-32BIT-NEXT:    stw 8, 40(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    or 8, 0, 17
+; LE-32BIT-NEXT:    slw 0, 10, 12
+; LE-32BIT-NEXT:    stw 8, 28(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    or 8, 29, 0
+; LE-32BIT-NEXT:    slw 0, 9, 12
+; LE-32BIT-NEXT:    or 12, 27, 0
+; LE-32BIT-NEXT:    stw 12, 32(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT:    slw 0, 31, 25
+; LE-32BIT-NEXT:    lwz 12, 48(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    or 0, 26, 0
+; LE-32BIT-NEXT:    mr 17, 10
+; LE-32BIT-NEXT:    slw 25, 10, 25
+; LE-32BIT-NEXT:    slw 26, 10, 12
+; LE-32BIT-NEXT:    or 26, 24, 26
+; LE-32BIT-NEXT:    slw 24, 10, 15
+; LE-32BIT-NEXT:    or 24, 19, 24
+; LE-32BIT-NEXT:    lwz 19, 56(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    or 11, 11, 25
+; LE-32BIT-NEXT:    lwz 10, 36(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    srw 25, 7, 20
+; LE-32BIT-NEXT:    cmplwi 6, 19, 64
+; LE-32BIT-NEXT:    or 8, 8, 25
+; LE-32BIT-NEXT:    bc 12, 24, .LBB11_16
+; LE-32BIT-NEXT:  # %bb.15:
+; LE-32BIT-NEXT:    ori 27, 10, 0
+; LE-32BIT-NEXT:    b .LBB11_17
+; LE-32BIT-NEXT:  .LBB11_16:
+; LE-32BIT-NEXT:    addi 27, 3, 0
+; LE-32BIT-NEXT:  .LBB11_17:
+; LE-32BIT-NEXT:    lwz 10, 52(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    srw 5, 7, 5
+; LE-32BIT-NEXT:    lwz 3, 44(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    or 5, 18, 5
+; LE-32BIT-NEXT:    srw 25, 10, 30
+; LE-32BIT-NEXT:    or 25, 25, 0
+; LE-32BIT-NEXT:    srw 0, 7, 19
+; LE-32BIT-NEXT:    or 26, 0, 26
+; LE-32BIT-NEXT:    srw 0, 7, 30
+; LE-32BIT-NEXT:    bc 12, 4, .LBB11_19
+; LE-32BIT-NEXT:  # %bb.18:
+; LE-32BIT-NEXT:    ori 29, 3, 0
+; LE-32BIT-NEXT:    b .LBB11_20
+; LE-32BIT-NEXT:  .LBB11_19:
+; LE-32BIT-NEXT:    addi 29, 21, 0
+; LE-32BIT-NEXT:  .LBB11_20:
+; LE-32BIT-NEXT:    mr 3, 7
+; LE-32BIT-NEXT:    or 11, 0, 11
+; LE-32BIT-NEXT:    bc 12, 28, .LBB11_22
+; LE-32BIT-NEXT:  # %bb.21:
+; LE-32BIT-NEXT:    ori 0, 14, 0
+; LE-32BIT-NEXT:    b .LBB11_23
+; LE-32BIT-NEXT:  .LBB11_22:
+; LE-32BIT-NEXT:    addi 0, 28, 0
+; LE-32BIT-NEXT:  .LBB11_23:
+; LE-32BIT-NEXT:    lwz 7, 72(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    mr 18, 4
+; LE-32BIT-NEXT:    stw 0, 4(4)
+; LE-32BIT-NEXT:    bc 12, 4, .LBB11_25
+; LE-32BIT-NEXT:  # %bb.24:
+; LE-32BIT-NEXT:    ori 24, 22, 0
+; LE-32BIT-NEXT:    b .LBB11_25
+; LE-32BIT-NEXT:  .LBB11_25:
+; LE-32BIT-NEXT:    cmplwi 5, 30, 0
+; LE-32BIT-NEXT:    lwz 4, 68(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    cmplwi 2, 19, 0
+; LE-32BIT-NEXT:    mr 31, 23
+; LE-32BIT-NEXT:    srw 30, 23, 30
+; LE-32BIT-NEXT:    slw 28, 9, 16
+; LE-32BIT-NEXT:    slw 23, 9, 15
+; LE-32BIT-NEXT:    sraw 21, 6, 7
+; LE-32BIT-NEXT:    bc 12, 10, .LBB11_27
+; LE-32BIT-NEXT:  # %bb.26:
+; LE-32BIT-NEXT:    ori 7, 27, 0
+; LE-32BIT-NEXT:    b .LBB11_28
+; LE-32BIT-NEXT:  .LBB11_27:
+; LE-32BIT-NEXT:    addi 7, 9, 0
+; LE-32BIT-NEXT:  .LBB11_28:
+; LE-32BIT-NEXT:    bc 12, 22, .LBB11_30
+; LE-32BIT-NEXT:  # %bb.29:
+; LE-32BIT-NEXT:    ori 12, 24, 0
+; LE-32BIT-NEXT:    b .LBB11_31
+; LE-32BIT-NEXT:  .LBB11_30:
+; LE-32BIT-NEXT:    addi 12, 9, 0
+; LE-32BIT-NEXT:  .LBB11_31:
+; LE-32BIT-NEXT:    lwz 9, 64(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    srw 22, 31, 4
+; LE-32BIT-NEXT:    sraw 20, 6, 4
+; LE-32BIT-NEXT:    lwz 4, 28(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    cmplwi 7, 16, 64
+; LE-32BIT-NEXT:    cmplwi 3, 16, 0
+; LE-32BIT-NEXT:    slw 0, 17, 16
+; LE-32BIT-NEXT:    lwz 16, 76(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    li 15, 0
+; LE-32BIT-NEXT:    or 5, 0, 5
+; LE-32BIT-NEXT:    bc 12, 28, .LBB11_33
+; LE-32BIT-NEXT:  # %bb.32:
+; LE-32BIT-NEXT:    ori 0, 15, 0
+; LE-32BIT-NEXT:    b .LBB11_34
+; LE-32BIT-NEXT:  .LBB11_33:
+; LE-32BIT-NEXT:    addi 0, 28, 0
+; LE-32BIT-NEXT:  .LBB11_34:
+; LE-32BIT-NEXT:    bc 12, 4, .LBB11_36
+; LE-32BIT-NEXT:  # %bb.35:
+; LE-32BIT-NEXT:    ori 28, 22, 0
+; LE-32BIT-NEXT:    ori 25, 15, 0
+; LE-32BIT-NEXT:    b .LBB11_37
+; LE-32BIT-NEXT:  .LBB11_36:
+; LE-32BIT-NEXT:    addi 28, 25, 0
+; LE-32BIT-NEXT:    addi 25, 9, 0
+; LE-32BIT-NEXT:  .LBB11_37:
+; LE-32BIT-NEXT:    lwz 9, 60(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    bc 12, 28, .LBB11_39
+; LE-32BIT-NEXT:  # %bb.38:
+; LE-32BIT-NEXT:    ori 8, 4, 0
+; LE-32BIT-NEXT:    b .LBB11_39
+; LE-32BIT-NEXT:  .LBB11_39:
+; LE-32BIT-NEXT:    bc 12, 4, .LBB11_41
+; LE-32BIT-NEXT:  # %bb.40:
+; LE-32BIT-NEXT:    ori 4, 20, 0
+; LE-32BIT-NEXT:    b .LBB11_42
+; LE-32BIT-NEXT:  .LBB11_41:
+; LE-32BIT-NEXT:    addi 4, 11, 0
+; LE-32BIT-NEXT:  .LBB11_42:
+; LE-32BIT-NEXT:    bc 12, 22, .LBB11_43
+; LE-32BIT-NEXT:    b .LBB11_44
+; LE-32BIT-NEXT:  .LBB11_43:
+; LE-32BIT-NEXT:    addi 29, 16, 0
+; LE-32BIT-NEXT:  .LBB11_44:
+; LE-32BIT-NEXT:    sraw 19, 6, 19
+; LE-32BIT-NEXT:    bc 12, 22, .LBB11_45
+; LE-32BIT-NEXT:    b .LBB11_46
+; LE-32BIT-NEXT:  .LBB11_45:
+; LE-32BIT-NEXT:    addi 4, 3, 0
+; LE-32BIT-NEXT:  .LBB11_46:
+; LE-32BIT-NEXT:    or 29, 29, 0
+; LE-32BIT-NEXT:    bc 12, 4, .LBB11_48
+; LE-32BIT-NEXT:  # %bb.47:
+; LE-32BIT-NEXT:    ori 0, 15, 0
+; LE-32BIT-NEXT:    b .LBB11_49
+; LE-32BIT-NEXT:  .LBB11_48:
+; LE-32BIT-NEXT:    addi 0, 30, 0
+; LE-32BIT-NEXT:  .LBB11_49:
+; LE-32BIT-NEXT:    bc 12, 14, .LBB11_51
+; LE-32BIT-NEXT:  # %bb.50:
+; LE-32BIT-NEXT:    ori 6, 8, 0
+; LE-32BIT-NEXT:    b .LBB11_51
+; LE-32BIT-NEXT:  .LBB11_51:
+; LE-32BIT-NEXT:    bc 12, 0, .LBB11_53
+; LE-32BIT-NEXT:  # %bb.52:
+; LE-32BIT-NEXT:    ori 4, 14, 0
+; LE-32BIT-NEXT:    b .LBB11_53
+; LE-32BIT-NEXT:  .LBB11_53:
+; LE-32BIT-NEXT:    bc 12, 24, .LBB11_55
+; LE-32BIT-NEXT:  # %bb.54:
+; LE-32BIT-NEXT:    ori 30, 14, 0
+; LE-32BIT-NEXT:    ori 26, 21, 0
+; LE-32BIT-NEXT:    b .LBB11_56
+; LE-32BIT-NEXT:  .LBB11_55:
+; LE-32BIT-NEXT:    addi 30, 19, 0
+; LE-32BIT-NEXT:  .LBB11_56:
+; LE-32BIT-NEXT:    bc 12, 28, .LBB11_58
+; LE-32BIT-NEXT:  # %bb.57:
+; LE-32BIT-NEXT:    ori 5, 23, 0
+; LE-32BIT-NEXT:    b .LBB11_58
+; LE-32BIT-NEXT:  .LBB11_58:
+; LE-32BIT-NEXT:    bc 12, 22, .LBB11_60
+; LE-32BIT-NEXT:  # %bb.59:
+; LE-32BIT-NEXT:    ori 8, 28, 0
+; LE-32BIT-NEXT:    b .LBB11_61
+; LE-32BIT-NEXT:  .LBB11_60:
+; LE-32BIT-NEXT:    addi 8, 10, 0
+; LE-32BIT-NEXT:  .LBB11_61:
+; LE-32BIT-NEXT:    bc 12, 0, .LBB11_63
+; LE-32BIT-NEXT:  # %bb.62:
+; LE-32BIT-NEXT:    ori 12, 14, 0
+; LE-32BIT-NEXT:    b .LBB11_63
+; LE-32BIT-NEXT:  .LBB11_63:
+; LE-32BIT-NEXT:    bc 12, 24, .LBB11_65
+; LE-32BIT-NEXT:  # %bb.64:
+; LE-32BIT-NEXT:    ori 24, 14, 0
+; LE-32BIT-NEXT:    b .LBB11_66
+; LE-32BIT-NEXT:  .LBB11_65:
+; LE-32BIT-NEXT:    addi 24, 9, 0
+; LE-32BIT-NEXT:  .LBB11_66:
+; LE-32BIT-NEXT:    lwz 9, 32(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    bc 12, 10, .LBB11_68
+; LE-32BIT-NEXT:  # %bb.67:
+; LE-32BIT-NEXT:    ori 28, 26, 0
+; LE-32BIT-NEXT:    b .LBB11_69
+; LE-32BIT-NEXT:  .LBB11_68:
+; LE-32BIT-NEXT:    addi 28, 3, 0
+; LE-32BIT-NEXT:  .LBB11_69:
+; LE-32BIT-NEXT:    bc 12, 0, .LBB11_71
+; LE-32BIT-NEXT:  # %bb.70:
+; LE-32BIT-NEXT:    ori 3, 7, 0
+; LE-32BIT-NEXT:    b .LBB11_72
+; LE-32BIT-NEXT:  .LBB11_71:
+; LE-32BIT-NEXT:    addi 3, 29, 0
+; LE-32BIT-NEXT:  .LBB11_72:
+; LE-32BIT-NEXT:    bc 12, 14, .LBB11_73
+; LE-32BIT-NEXT:    b .LBB11_74
+; LE-32BIT-NEXT:  .LBB11_73:
+; LE-32BIT-NEXT:    addi 5, 17, 0
+; LE-32BIT-NEXT:  .LBB11_74:
+; LE-32BIT-NEXT:    stw 4, 8(18)
+; LE-32BIT-NEXT:    or 4, 0, 6
+; LE-32BIT-NEXT:    bc 12, 0, .LBB11_76
+; LE-32BIT-NEXT:  # %bb.75:
+; LE-32BIT-NEXT:    ori 4, 30, 0
+; LE-32BIT-NEXT:    b .LBB11_76
+; LE-32BIT-NEXT:  .LBB11_76:
+; LE-32BIT-NEXT:    bc 12, 28, .LBB11_78
+; LE-32BIT-NEXT:  # %bb.77:
+; LE-32BIT-NEXT:    ori 27, 15, 0
+; LE-32BIT-NEXT:    b .LBB11_79
+; LE-32BIT-NEXT:  .LBB11_78:
+; LE-32BIT-NEXT:    addi 27, 9, 0
+; LE-32BIT-NEXT:  .LBB11_79:
+; LE-32BIT-NEXT:    bc 12, 22, .LBB11_80
+; LE-32BIT-NEXT:    b .LBB11_81
+; LE-32BIT-NEXT:  .LBB11_80:
+; LE-32BIT-NEXT:    addi 3, 16, 0
+; LE-32BIT-NEXT:  .LBB11_81:
+; LE-32BIT-NEXT:    stw 12, 12(18)
+; LE-32BIT-NEXT:    bc 12, 22, .LBB11_82
+; LE-32BIT-NEXT:    b .LBB11_83
+; LE-32BIT-NEXT:  .LBB11_82:
+; LE-32BIT-NEXT:    addi 4, 31, 0
+; LE-32BIT-NEXT:  .LBB11_83:
+; LE-32BIT-NEXT:    or 7, 8, 27
+; LE-32BIT-NEXT:    stw 4, 16(18)
+; LE-32BIT-NEXT:    bc 12, 0, .LBB11_85
+; LE-32BIT-NEXT:  # %bb.84:
+; LE-32BIT-NEXT:    ori 6, 28, 0
+; LE-32BIT-NEXT:    b .LBB11_86
+; LE-32BIT-NEXT:  .LBB11_85:
+; LE-32BIT-NEXT:    addi 6, 7, 0
+; LE-32BIT-NEXT:  .LBB11_86:
+; LE-32BIT-NEXT:    lwz 4, 40(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    stw 3, 28(18)
+; LE-32BIT-NEXT:    or 3, 25, 5
+; LE-32BIT-NEXT:    bc 12, 0, .LBB11_88
+; LE-32BIT-NEXT:  # %bb.87:
+; LE-32BIT-NEXT:    ori 3, 24, 0
+; LE-32BIT-NEXT:    b .LBB11_88
+; LE-32BIT-NEXT:  .LBB11_88:
+; LE-32BIT-NEXT:    bc 12, 22, .LBB11_90
+; LE-32BIT-NEXT:  # %bb.89:
+; LE-32BIT-NEXT:    ori 5, 6, 0
+; LE-32BIT-NEXT:    b .LBB11_91
+; LE-32BIT-NEXT:  .LBB11_90:
+; LE-32BIT-NEXT:    addi 5, 10, 0
+; LE-32BIT-NEXT:    addi 3, 4, 0
+; LE-32BIT-NEXT:  .LBB11_91:
+; LE-32BIT-NEXT:    stw 5, 24(18)
+; LE-32BIT-NEXT:    stw 3, 20(18)
+; LE-32BIT-NEXT:    lwz 12, 84(1)
+; LE-32BIT-NEXT:    lwz 31, 156(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    mtcrf 32, 12 # cr2
+; LE-32BIT-NEXT:    mtcrf 16, 12 # cr3
+; LE-32BIT-NEXT:    lwz 30, 152(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 29, 148(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 28, 144(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 27, 140(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 26, 136(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 25, 132(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 24, 128(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 23, 124(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 22, 120(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 21, 116(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 20, 112(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 19, 108(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 18, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 17, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 16, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 15, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    lwz 14, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT:    addi 1, 1, 160
+; LE-32BIT-NEXT:    blr
+  %src = load i256, ptr %src.ptr, align 1
+  %bitOff = load i256, ptr %bitOff.ptr, align 1
+  %res = ashr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; LE: {{.*}}

diff  --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
new file mode 100644
index 000000000000..5ae623057375
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -0,0 +1,4168 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ALL,RV64I
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ALL,RV32I
+
+define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_4bytes:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srlw a0, a0, a1
+; RV64I-NEXT:    sb a0, 0(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 1(a2)
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: lshr_4bytes:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 1(a1)
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    srl a0, a0, a1
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 2(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 3(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 1(a2)
+; RV32I-NEXT:    ret
+  %src = load i32, ptr %src.ptr, align 1
+  %bitOff = load i32, ptr %bitOff.ptr, align 1
+  %res = lshr i32 %src, %bitOff
+  store i32 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_4bytes:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    sllw a0, a0, a1
+; RV64I-NEXT:    sb a0, 0(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 1(a2)
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: shl_4bytes:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 1(a1)
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 2(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 3(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 1(a2)
+; RV32I-NEXT:    ret
+  %src = load i32, ptr %src.ptr, align 1
+  %bitOff = load i32, ptr %bitOff.ptr, align 1
+  %res = shl i32 %src, %bitOff
+  store i32 %res, ptr %dst, align 1
+  ret void
+}
+define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_4bytes:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    lb a0, 3(a0)
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    sraw a0, a0, a1
+; RV64I-NEXT:    sb a0, 0(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 1(a2)
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: ashr_4bytes:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    lbu a3, 1(a1)
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    sra a0, a0, a1
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 2(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 3(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 1(a2)
+; RV32I-NEXT:    ret
+  %src = load i32, ptr %src.ptr, align 1
+  %bitOff = load i32, ptr %bitOff.ptr, align 1
+  %res = ashr i32 %src, %bitOff
+  store i32 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_8bytes:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, a4
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    lbu a3, 1(a1)
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 2(a1)
+; RV64I-NEXT:    lbu a6, 3(a1)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a1)
+; RV64I-NEXT:    lbu a5, 4(a1)
+; RV64I-NEXT:    lbu a6, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    sb a0, 0(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 1(a2)
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: lshr_8bytes:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lbu a3, 5(a0)
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lbu a5, 6(a0)
+; RV32I-NEXT:    lbu a6, 7(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a4, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a3, 1(a1)
+; RV32I-NEXT:    lbu a6, 0(a1)
+; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    addi a3, a1, -32
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    bltz a3, .LBB3_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    srl a0, a4, a3
+; RV32I-NEXT:    j .LBB3_3
+; RV32I-NEXT:  .LBB3_2:
+; RV32I-NEXT:    lbu a5, 1(a0)
+; RV32I-NEXT:    lbu a6, 0(a0)
+; RV32I-NEXT:    lbu a7, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, a7
+; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:    srl a0, a0, a1
+; RV32I-NEXT:    xori a5, a1, 31
+; RV32I-NEXT:    slli a6, a4, 1
+; RV32I-NEXT:    sll a5, a6, a5
+; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:  .LBB3_3:
+; RV32I-NEXT:    srl a1, a4, a1
+; RV32I-NEXT:    slti a3, a3, 0
+; RV32I-NEXT:    neg a3, a3
+; RV32I-NEXT:    and a1, a3, a1
+; RV32I-NEXT:    sb a1, 4(a2)
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 7(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 2(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 3(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 1(a2)
+; RV32I-NEXT:    ret
+  %src = load i64, ptr %src.ptr, align 1
+  %bitOff = load i64, ptr %bitOff.ptr, align 1
+  %res = lshr i64 %src, %bitOff
+  store i64 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_8bytes:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, a4
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    lbu a3, 1(a1)
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 2(a1)
+; RV64I-NEXT:    lbu a6, 3(a1)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a1)
+; RV64I-NEXT:    lbu a5, 4(a1)
+; RV64I-NEXT:    lbu a6, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    sb a0, 0(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 1(a2)
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: shl_8bytes:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a4, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a3, 1(a1)
+; RV32I-NEXT:    lbu a6, 0(a1)
+; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    addi a3, a1, -32
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    bltz a3, .LBB4_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sll a0, a4, a3
+; RV32I-NEXT:    j .LBB4_3
+; RV32I-NEXT:  .LBB4_2:
+; RV32I-NEXT:    lbu a5, 5(a0)
+; RV32I-NEXT:    lbu a6, 4(a0)
+; RV32I-NEXT:    lbu a7, 6(a0)
+; RV32I-NEXT:    lbu a0, 7(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, a7
+; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    xori a5, a1, 31
+; RV32I-NEXT:    srli a6, a4, 1
+; RV32I-NEXT:    srl a5, a6, a5
+; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:  .LBB4_3:
+; RV32I-NEXT:    sll a1, a4, a1
+; RV32I-NEXT:    slti a3, a3, 0
+; RV32I-NEXT:    neg a3, a3
+; RV32I-NEXT:    and a1, a3, a1
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a3, a0, 16
+; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    srli a3, a0, 24
+; RV32I-NEXT:    sb a3, 7(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    srli a0, a1, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a0, a1, 24
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    ret
+  %src = load i64, ptr %src.ptr, align 1
+  %bitOff = load i64, ptr %bitOff.ptr, align 1
+  %res = shl i64 %src, %bitOff
+  store i64 %res, ptr %dst, align 1
+  ret void
+}
+define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_8bytes:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    or a0, a0, a4
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    lbu a3, 1(a1)
+; RV64I-NEXT:    lbu a4, 0(a1)
+; RV64I-NEXT:    lbu a5, 2(a1)
+; RV64I-NEXT:    lbu a6, 3(a1)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a1)
+; RV64I-NEXT:    lbu a5, 4(a1)
+; RV64I-NEXT:    lbu a6, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    sra a0, a0, a1
+; RV64I-NEXT:    sb a0, 0(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 1(a2)
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: ashr_8bytes:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lbu a3, 5(a0)
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    or a6, a3, a4
+; RV32I-NEXT:    lbu a3, 6(a0)
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    or a5, a5, a4
+; RV32I-NEXT:    lbu a4, 7(a0)
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    slli a3, a3, 16
+; RV32I-NEXT:    slli a4, a4, 24
+; RV32I-NEXT:    or a7, a4, a3
+; RV32I-NEXT:    or a3, a1, a5
+; RV32I-NEXT:    addi a5, a3, -32
+; RV32I-NEXT:    or a1, a7, a6
+; RV32I-NEXT:    bltz a5, .LBB5_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sra a0, a1, a5
+; RV32I-NEXT:    srai a1, a4, 31
+; RV32I-NEXT:    j .LBB5_3
+; RV32I-NEXT:  .LBB5_2:
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu a6, 2(a0)
+; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    srl a0, a0, a3
+; RV32I-NEXT:    xori a4, a3, 31
+; RV32I-NEXT:    slli a5, a1, 1
+; RV32I-NEXT:    sll a4, a5, a4
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    sra a1, a1, a3
+; RV32I-NEXT:  .LBB5_3:
+; RV32I-NEXT:    sb a1, 4(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 7(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 2(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 3(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 1(a2)
+; RV32I-NEXT:    ret
+  %src = load i64, ptr %src.ptr, align 1
+  %bitOff = load i64, ptr %bitOff.ptr, align 1
+  %res = ashr i64 %src, %bitOff
+  store i64 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_16bytes:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lbu a3, 9(a0)
+; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a5, 10(a0)
+; RV64I-NEXT:    lbu a6, 11(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a4, a4, a3
+; RV64I-NEXT:    lbu a3, 13(a0)
+; RV64I-NEXT:    lbu a5, 12(a0)
+; RV64I-NEXT:    lbu a6, 14(a0)
+; RV64I-NEXT:    lbu a7, 15(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    lbu a5, 1(a1)
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a7, 2(a1)
+; RV64I-NEXT:    lbu t0, 3(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 5(a1)
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    slli a7, a3, 32
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a5
+; RV64I-NEXT:    addi a3, a1, -64
+; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    bltz a3, .LBB6_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    srl a0, a4, a3
+; RV64I-NEXT:    j .LBB6_3
+; RV64I-NEXT:  .LBB6_2:
+; RV64I-NEXT:    lbu a5, 1(a0)
+; RV64I-NEXT:    lbu a6, 0(a0)
+; RV64I-NEXT:    lbu a7, 2(a0)
+; RV64I-NEXT:    lbu t0, 3(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 5(a0)
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    xori a5, a1, 63
+; RV64I-NEXT:    slli a6, a4, 1
+; RV64I-NEXT:    sll a5, a6, a5
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:  .LBB6_3:
+; RV64I-NEXT:    srl a1, a4, a1
+; RV64I-NEXT:    slti a3, a3, 0
+; RV64I-NEXT:    neg a3, a3
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    sb a1, 8(a2)
+; RV64I-NEXT:    sb a0, 0(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 15(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 14(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 13(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 12(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 10(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 1(a2)
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: lshr_16bytes:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 5(a0)
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lbu a5, 6(a0)
+; RV32I-NEXT:    lbu a6, 7(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a7, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or t0, a6, a5
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or t3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or t4, a6, a5
+; RV32I-NEXT:    lbu a3, 13(a0)
+; RV32I-NEXT:    lbu a4, 12(a0)
+; RV32I-NEXT:    lbu a5, 14(a0)
+; RV32I-NEXT:    lbu a6, 15(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 9(a0)
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 10(a0)
+; RV32I-NEXT:    lbu a0, 11(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a5, a0, a6
+; RV32I-NEXT:    or a5, a5, a4
+; RV32I-NEXT:    lbu a0, 1(a1)
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a4, a1, a6
+; RV32I-NEXT:    or a4, a4, a0
+; RV32I-NEXT:    addi t1, a4, -64
+; RV32I-NEXT:    addi t2, a4, -96
+; RV32I-NEXT:    slli a6, a3, 1
+; RV32I-NEXT:    bltz t2, .LBB6_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    srl t5, a3, t2
+; RV32I-NEXT:    j .LBB6_3
+; RV32I-NEXT:  .LBB6_2:
+; RV32I-NEXT:    srl a0, a5, t1
+; RV32I-NEXT:    xori a1, t1, 31
+; RV32I-NEXT:    sll a1, a6, a1
+; RV32I-NEXT:    or t5, a0, a1
+; RV32I-NEXT:  .LBB6_3:
+; RV32I-NEXT:    or a0, t0, a7
+; RV32I-NEXT:    or a1, t4, t3
+; RV32I-NEXT:    addi t0, a4, -32
+; RV32I-NEXT:    xori a7, a4, 31
+; RV32I-NEXT:    bltz t0, .LBB6_5
+; RV32I-NEXT:  # %bb.4:
+; RV32I-NEXT:    srl s1, a0, t0
+; RV32I-NEXT:    j .LBB6_6
+; RV32I-NEXT:  .LBB6_5:
+; RV32I-NEXT:    srl t3, a1, a4
+; RV32I-NEXT:    slli t4, a0, 1
+; RV32I-NEXT:    sll t4, t4, a7
+; RV32I-NEXT:    or s1, t3, t4
+; RV32I-NEXT:  .LBB6_6:
+; RV32I-NEXT:    neg t3, a4
+; RV32I-NEXT:    sll t4, a5, t3
+; RV32I-NEXT:    li s0, 32
+; RV32I-NEXT:    li t6, 64
+; RV32I-NEXT:    sub s0, s0, a4
+; RV32I-NEXT:    bltu a4, t6, .LBB6_12
+; RV32I-NEXT:  # %bb.7:
+; RV32I-NEXT:    bnez a4, .LBB6_13
+; RV32I-NEXT:  .LBB6_8:
+; RV32I-NEXT:    bgez s0, .LBB6_10
+; RV32I-NEXT:  .LBB6_9:
+; RV32I-NEXT:    sll t3, a3, t3
+; RV32I-NEXT:    srli t4, a5, 1
+; RV32I-NEXT:    sub t5, t6, a4
+; RV32I-NEXT:    xori t5, t5, 31
+; RV32I-NEXT:    srl t4, t4, t5
+; RV32I-NEXT:    or t4, t3, t4
+; RV32I-NEXT:  .LBB6_10:
+; RV32I-NEXT:    slti t3, t0, 0
+; RV32I-NEXT:    neg t3, t3
+; RV32I-NEXT:    bltu a4, t6, .LBB6_14
+; RV32I-NEXT:  # %bb.11:
+; RV32I-NEXT:    srl t1, a3, t1
+; RV32I-NEXT:    slti t2, t2, 0
+; RV32I-NEXT:    neg t2, t2
+; RV32I-NEXT:    and t1, t2, t1
+; RV32I-NEXT:    bnez a4, .LBB6_15
+; RV32I-NEXT:    j .LBB6_16
+; RV32I-NEXT:  .LBB6_12:
+; RV32I-NEXT:    slti t5, s0, 0
+; RV32I-NEXT:    neg t5, t5
+; RV32I-NEXT:    and t5, t5, t4
+; RV32I-NEXT:    or t5, s1, t5
+; RV32I-NEXT:    beqz a4, .LBB6_8
+; RV32I-NEXT:  .LBB6_13:
+; RV32I-NEXT:    mv a1, t5
+; RV32I-NEXT:    bltz s0, .LBB6_9
+; RV32I-NEXT:    j .LBB6_10
+; RV32I-NEXT:  .LBB6_14:
+; RV32I-NEXT:    srl t1, a0, a4
+; RV32I-NEXT:    and t1, t3, t1
+; RV32I-NEXT:    or t1, t1, t4
+; RV32I-NEXT:    beqz a4, .LBB6_16
+; RV32I-NEXT:  .LBB6_15:
+; RV32I-NEXT:    mv a0, t1
+; RV32I-NEXT:  .LBB6_16:
+; RV32I-NEXT:    bltz t0, .LBB6_18
+; RV32I-NEXT:  # %bb.17:
+; RV32I-NEXT:    srl a5, a3, t0
+; RV32I-NEXT:    j .LBB6_19
+; RV32I-NEXT:  .LBB6_18:
+; RV32I-NEXT:    srl a5, a5, a4
+; RV32I-NEXT:    sll a6, a6, a7
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:  .LBB6_19:
+; RV32I-NEXT:    sltiu a6, a4, 64
+; RV32I-NEXT:    neg a6, a6
+; RV32I-NEXT:    and a5, a6, a5
+; RV32I-NEXT:    srl a3, a3, a4
+; RV32I-NEXT:    and a3, t3, a3
+; RV32I-NEXT:    and a3, a6, a3
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    sb a3, 12(a2)
+; RV32I-NEXT:    srli a4, a5, 16
+; RV32I-NEXT:    sb a4, 10(a2)
+; RV32I-NEXT:    srli a4, a5, 24
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 14(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 15(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+  %src = load i128, ptr %src.ptr, align 1
+  %bitOff = load i128, ptr %bitOff.ptr, align 1
+  %res = lshr i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_16bytes:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a4, a4, a3
+; RV64I-NEXT:    lbu a3, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    lbu a5, 1(a1)
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a7, 2(a1)
+; RV64I-NEXT:    lbu t0, 3(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 5(a1)
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    slli a7, a3, 32
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a5
+; RV64I-NEXT:    addi a3, a1, -64
+; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    bltz a3, .LBB7_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    sll a0, a4, a3
+; RV64I-NEXT:    j .LBB7_3
+; RV64I-NEXT:  .LBB7_2:
+; RV64I-NEXT:    lbu a5, 9(a0)
+; RV64I-NEXT:    lbu a6, 8(a0)
+; RV64I-NEXT:    lbu a7, 10(a0)
+; RV64I-NEXT:    lbu t0, 11(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 13(a0)
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 14(a0)
+; RV64I-NEXT:    lbu a0, 15(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    sll a0, a0, a1
+; RV64I-NEXT:    xori a5, a1, 63
+; RV64I-NEXT:    srli a6, a4, 1
+; RV64I-NEXT:    srl a5, a6, a5
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:  .LBB7_3:
+; RV64I-NEXT:    sll a1, a4, a1
+; RV64I-NEXT:    slti a3, a3, 0
+; RV64I-NEXT:    neg a3, a3
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    sb a1, 0(a2)
+; RV64I-NEXT:    srli a3, a0, 56
+; RV64I-NEXT:    sb a3, 15(a2)
+; RV64I-NEXT:    srli a3, a0, 48
+; RV64I-NEXT:    sb a3, 14(a2)
+; RV64I-NEXT:    srli a3, a0, 40
+; RV64I-NEXT:    sb a3, 13(a2)
+; RV64I-NEXT:    srli a3, a0, 32
+; RV64I-NEXT:    sb a3, 12(a2)
+; RV64I-NEXT:    srli a3, a0, 24
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    srli a3, a0, 16
+; RV64I-NEXT:    sb a3, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    srli a0, a1, 56
+; RV64I-NEXT:    sb a0, 7(a2)
+; RV64I-NEXT:    srli a0, a1, 48
+; RV64I-NEXT:    sb a0, 6(a2)
+; RV64I-NEXT:    srli a0, a1, 40
+; RV64I-NEXT:    sb a0, 5(a2)
+; RV64I-NEXT:    srli a0, a1, 32
+; RV64I-NEXT:    sb a0, 4(a2)
+; RV64I-NEXT:    srli a0, a1, 24
+; RV64I-NEXT:    sb a0, 3(a2)
+; RV64I-NEXT:    srli a0, a1, 16
+; RV64I-NEXT:    sb a0, 2(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 1(a2)
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: shl_16bytes:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 9(a0)
+; RV32I-NEXT:    lbu a4, 8(a0)
+; RV32I-NEXT:    lbu a5, 10(a0)
+; RV32I-NEXT:    lbu a6, 11(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a7, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or t0, a6, a5
+; RV32I-NEXT:    lbu a3, 13(a0)
+; RV32I-NEXT:    lbu a4, 12(a0)
+; RV32I-NEXT:    lbu a5, 14(a0)
+; RV32I-NEXT:    lbu a6, 15(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or t3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or t4, a6, a5
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 5(a0)
+; RV32I-NEXT:    lbu a5, 4(a0)
+; RV32I-NEXT:    lbu a6, 6(a0)
+; RV32I-NEXT:    lbu a0, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a5, a0, a6
+; RV32I-NEXT:    or a5, a5, a4
+; RV32I-NEXT:    lbu a0, 1(a1)
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a4, a1, a6
+; RV32I-NEXT:    or a4, a4, a0
+; RV32I-NEXT:    addi t1, a4, -64
+; RV32I-NEXT:    addi t2, a4, -96
+; RV32I-NEXT:    srli a6, a3, 1
+; RV32I-NEXT:    bltz t2, .LBB7_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sll t5, a3, t2
+; RV32I-NEXT:    j .LBB7_3
+; RV32I-NEXT:  .LBB7_2:
+; RV32I-NEXT:    sll a0, a5, t1
+; RV32I-NEXT:    xori a1, t1, 31
+; RV32I-NEXT:    srl a1, a6, a1
+; RV32I-NEXT:    or t5, a0, a1
+; RV32I-NEXT:  .LBB7_3:
+; RV32I-NEXT:    or a0, t0, a7
+; RV32I-NEXT:    or a1, t4, t3
+; RV32I-NEXT:    addi t0, a4, -32
+; RV32I-NEXT:    xori a7, a4, 31
+; RV32I-NEXT:    bltz t0, .LBB7_5
+; RV32I-NEXT:  # %bb.4:
+; RV32I-NEXT:    sll s1, a0, t0
+; RV32I-NEXT:    j .LBB7_6
+; RV32I-NEXT:  .LBB7_5:
+; RV32I-NEXT:    sll t3, a1, a4
+; RV32I-NEXT:    srli t4, a0, 1
+; RV32I-NEXT:    srl t4, t4, a7
+; RV32I-NEXT:    or s1, t3, t4
+; RV32I-NEXT:  .LBB7_6:
+; RV32I-NEXT:    neg t3, a4
+; RV32I-NEXT:    srl t4, a5, t3
+; RV32I-NEXT:    li s0, 32
+; RV32I-NEXT:    li t6, 64
+; RV32I-NEXT:    sub s0, s0, a4
+; RV32I-NEXT:    bltu a4, t6, .LBB7_12
+; RV32I-NEXT:  # %bb.7:
+; RV32I-NEXT:    bnez a4, .LBB7_13
+; RV32I-NEXT:  .LBB7_8:
+; RV32I-NEXT:    bgez s0, .LBB7_10
+; RV32I-NEXT:  .LBB7_9:
+; RV32I-NEXT:    srl t3, a3, t3
+; RV32I-NEXT:    slli t4, a5, 1
+; RV32I-NEXT:    sub t5, t6, a4
+; RV32I-NEXT:    xori t5, t5, 31
+; RV32I-NEXT:    sll t4, t4, t5
+; RV32I-NEXT:    or t4, t3, t4
+; RV32I-NEXT:  .LBB7_10:
+; RV32I-NEXT:    slti t3, t0, 0
+; RV32I-NEXT:    neg t3, t3
+; RV32I-NEXT:    bltu a4, t6, .LBB7_14
+; RV32I-NEXT:  # %bb.11:
+; RV32I-NEXT:    sll t1, a3, t1
+; RV32I-NEXT:    slti t2, t2, 0
+; RV32I-NEXT:    neg t2, t2
+; RV32I-NEXT:    and t1, t2, t1
+; RV32I-NEXT:    bnez a4, .LBB7_15
+; RV32I-NEXT:    j .LBB7_16
+; RV32I-NEXT:  .LBB7_12:
+; RV32I-NEXT:    slti t5, s0, 0
+; RV32I-NEXT:    neg t5, t5
+; RV32I-NEXT:    and t5, t5, t4
+; RV32I-NEXT:    or t5, s1, t5
+; RV32I-NEXT:    beqz a4, .LBB7_8
+; RV32I-NEXT:  .LBB7_13:
+; RV32I-NEXT:    mv a1, t5
+; RV32I-NEXT:    bltz s0, .LBB7_9
+; RV32I-NEXT:    j .LBB7_10
+; RV32I-NEXT:  .LBB7_14:
+; RV32I-NEXT:    sll t1, a0, a4
+; RV32I-NEXT:    and t1, t3, t1
+; RV32I-NEXT:    or t1, t1, t4
+; RV32I-NEXT:    beqz a4, .LBB7_16
+; RV32I-NEXT:  .LBB7_15:
+; RV32I-NEXT:    mv a0, t1
+; RV32I-NEXT:  .LBB7_16:
+; RV32I-NEXT:    bltz t0, .LBB7_18
+; RV32I-NEXT:  # %bb.17:
+; RV32I-NEXT:    sll a5, a3, t0
+; RV32I-NEXT:    j .LBB7_19
+; RV32I-NEXT:  .LBB7_18:
+; RV32I-NEXT:    sll a5, a5, a4
+; RV32I-NEXT:    srl a6, a6, a7
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:  .LBB7_19:
+; RV32I-NEXT:    sltiu a6, a4, 64
+; RV32I-NEXT:    neg a6, a6
+; RV32I-NEXT:    and a5, a6, a5
+; RV32I-NEXT:    sll a3, a3, a4
+; RV32I-NEXT:    and a3, t3, a3
+; RV32I-NEXT:    and a3, a6, a3
+; RV32I-NEXT:    sb a3, 0(a2)
+; RV32I-NEXT:    sb a5, 4(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 2(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 3(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 1(a2)
+; RV32I-NEXT:    srli a3, a5, 16
+; RV32I-NEXT:    sb a3, 6(a2)
+; RV32I-NEXT:    srli a3, a5, 24
+; RV32I-NEXT:    sb a3, 7(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 5(a2)
+; RV32I-NEXT:    sb a1, 12(a2)
+; RV32I-NEXT:    sb a0, 8(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 14(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 15(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 13(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 10(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 11(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 9(a2)
+; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+  %src = load i128, ptr %src.ptr, align 1
+  %bitOff = load i128, ptr %bitOff.ptr, align 1
+  %res = shl i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+; ashr_16bytes: arithmetic shift right of an i128 loaded through an
+; unaligned (align 1) pointer, by an i128 amount also loaded align 1.
+; The expected RV64I/RV32I assembly below is autogenerated; it assembles
+; both operands from individual byte loads (lbu) and scatters the result
+; with byte stores (sb). Note the lowering shown only reads the low
+; XLEN bits of the shift amount (bytes 0-7 of a1 on RV64I, bytes 0-3 on
+; RV32I) -- presumably because larger amounts make the result implementation
+; defined for an oversized shift.
+define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_16bytes:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    lbu a3, 9(a0)
+; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a5, 10(a0)
+; RV64I-NEXT:    lbu a6, 11(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 13(a0)
+; RV64I-NEXT:    lbu a5, 12(a0)
+; RV64I-NEXT:    lbu a6, 14(a0)
+; RV64I-NEXT:    lbu a7, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 1(a1)
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a7, 2(a1)
+; RV64I-NEXT:    lbu t0, 3(a1)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 5(a1)
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    slli a7, a4, 32
+; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a1, a1, a5
+; RV64I-NEXT:    addi a5, a1, -64
+; RV64I-NEXT:    or a3, a7, a3
+; RV64I-NEXT:    bltz a5, .LBB8_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    sra a0, a3, a5
+; RV64I-NEXT:    sraiw a1, a4, 31
+; RV64I-NEXT:    j .LBB8_3
+; RV64I-NEXT:  .LBB8_2:
+; RV64I-NEXT:    lbu a4, 1(a0)
+; RV64I-NEXT:    lbu a5, 0(a0)
+; RV64I-NEXT:    lbu a6, 2(a0)
+; RV64I-NEXT:    lbu a7, 3(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 5(a0)
+; RV64I-NEXT:    lbu a6, 4(a0)
+; RV64I-NEXT:    lbu a7, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a0, a0, a4
+; RV64I-NEXT:    srl a0, a0, a1
+; RV64I-NEXT:    xori a4, a1, 63
+; RV64I-NEXT:    slli a5, a3, 1
+; RV64I-NEXT:    sll a4, a5, a4
+; RV64I-NEXT:    or a0, a0, a4
+; RV64I-NEXT:    sra a1, a3, a1
+; RV64I-NEXT:  .LBB8_3:
+; RV64I-NEXT:    sb a1, 8(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 15(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 14(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 13(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 12(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 10(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    sb a0, 0(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 7(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 6(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 5(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 4(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 1(a2)
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: ashr_16bytes:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 5(a0)
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lbu a5, 6(a0)
+; RV32I-NEXT:    lbu a6, 7(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a7, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or t1, a6, a5
+; RV32I-NEXT:    lbu a3, 1(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a5, 2(a0)
+; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or t2, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or t5, a6, a5
+; RV32I-NEXT:    lbu a3, 13(a0)
+; RV32I-NEXT:    lbu a4, 12(a0)
+; RV32I-NEXT:    lbu a5, 14(a0)
+; RV32I-NEXT:    lbu t0, 15(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    or a4, t0, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 9(a0)
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 10(a0)
+; RV32I-NEXT:    lbu a0, 11(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a5, a0, a6
+; RV32I-NEXT:    or a5, a5, a4
+; RV32I-NEXT:    lbu a0, 1(a1)
+; RV32I-NEXT:    lbu a4, 0(a1)
+; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a4, a1, a6
+; RV32I-NEXT:    or a4, a4, a0
+; RV32I-NEXT:    addi t3, a4, -64
+; RV32I-NEXT:    addi t4, a4, -96
+; RV32I-NEXT:    slli a6, a3, 1
+; RV32I-NEXT:    bltz t4, .LBB8_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sra t6, a3, t4
+; RV32I-NEXT:    j .LBB8_3
+; RV32I-NEXT:  .LBB8_2:
+; RV32I-NEXT:    srl a0, a5, t3
+; RV32I-NEXT:    xori a1, t3, 31
+; RV32I-NEXT:    sll a1, a6, a1
+; RV32I-NEXT:    or t6, a0, a1
+; RV32I-NEXT:  .LBB8_3:
+; RV32I-NEXT:    or a0, t1, a7
+; RV32I-NEXT:    or a1, t5, t2
+; RV32I-NEXT:    addi a7, a4, -32
+; RV32I-NEXT:    xori t2, a4, 31
+; RV32I-NEXT:    bltz a7, .LBB8_5
+; RV32I-NEXT:  # %bb.4:
+; RV32I-NEXT:    srl s2, a0, a7
+; RV32I-NEXT:    j .LBB8_6
+; RV32I-NEXT:  .LBB8_5:
+; RV32I-NEXT:    srl t1, a1, a4
+; RV32I-NEXT:    slli t5, a0, 1
+; RV32I-NEXT:    sll t5, t5, t2
+; RV32I-NEXT:    or s2, t1, t5
+; RV32I-NEXT:  .LBB8_6:
+; RV32I-NEXT:    neg s0, a4
+; RV32I-NEXT:    sll t5, a5, s0
+; RV32I-NEXT:    li s1, 32
+; RV32I-NEXT:    li t1, 64
+; RV32I-NEXT:    sub s1, s1, a4
+; RV32I-NEXT:    bltu a4, t1, .LBB8_11
+; RV32I-NEXT:  # %bb.7:
+; RV32I-NEXT:    bnez a4, .LBB8_12
+; RV32I-NEXT:  .LBB8_8:
+; RV32I-NEXT:    bltz s1, .LBB8_13
+; RV32I-NEXT:  .LBB8_9:
+; RV32I-NEXT:    srai t0, t0, 31
+; RV32I-NEXT:    bltz t4, .LBB8_14
+; RV32I-NEXT:  .LBB8_10:
+; RV32I-NEXT:    mv t3, t0
+; RV32I-NEXT:    bltu a4, t1, .LBB8_15
+; RV32I-NEXT:    j .LBB8_16
+; RV32I-NEXT:  .LBB8_11:
+; RV32I-NEXT:    slti t6, s1, 0
+; RV32I-NEXT:    neg t6, t6
+; RV32I-NEXT:    and t6, t6, t5
+; RV32I-NEXT:    or t6, s2, t6
+; RV32I-NEXT:    beqz a4, .LBB8_8
+; RV32I-NEXT:  .LBB8_12:
+; RV32I-NEXT:    mv a1, t6
+; RV32I-NEXT:    bgez s1, .LBB8_9
+; RV32I-NEXT:  .LBB8_13:
+; RV32I-NEXT:    sll t5, a3, s0
+; RV32I-NEXT:    srli t6, a5, 1
+; RV32I-NEXT:    sub s0, t1, a4
+; RV32I-NEXT:    xori s0, s0, 31
+; RV32I-NEXT:    srl t6, t6, s0
+; RV32I-NEXT:    or t5, t5, t6
+; RV32I-NEXT:    srai t0, t0, 31
+; RV32I-NEXT:    bgez t4, .LBB8_10
+; RV32I-NEXT:  .LBB8_14:
+; RV32I-NEXT:    sra t3, a3, t3
+; RV32I-NEXT:    bgeu a4, t1, .LBB8_16
+; RV32I-NEXT:  .LBB8_15:
+; RV32I-NEXT:    slti t3, a7, 0
+; RV32I-NEXT:    srl t4, a0, a4
+; RV32I-NEXT:    neg t3, t3
+; RV32I-NEXT:    and t3, t3, t4
+; RV32I-NEXT:    or t3, t3, t5
+; RV32I-NEXT:  .LBB8_16:
+; RV32I-NEXT:    bnez a4, .LBB8_19
+; RV32I-NEXT:  # %bb.17:
+; RV32I-NEXT:    bltz a7, .LBB8_20
+; RV32I-NEXT:  .LBB8_18:
+; RV32I-NEXT:    sra a5, a3, a7
+; RV32I-NEXT:    bgeu a4, t1, .LBB8_21
+; RV32I-NEXT:    j .LBB8_22
+; RV32I-NEXT:  .LBB8_19:
+; RV32I-NEXT:    mv a0, t3
+; RV32I-NEXT:    bgez a7, .LBB8_18
+; RV32I-NEXT:  .LBB8_20:
+; RV32I-NEXT:    srl a5, a5, a4
+; RV32I-NEXT:    sll a6, a6, t2
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    bltu a4, t1, .LBB8_22
+; RV32I-NEXT:  .LBB8_21:
+; RV32I-NEXT:    mv a5, t0
+; RV32I-NEXT:  .LBB8_22:
+; RV32I-NEXT:    bltz a7, .LBB8_24
+; RV32I-NEXT:  # %bb.23:
+; RV32I-NEXT:    mv a3, t0
+; RV32I-NEXT:    bgeu a4, t1, .LBB8_25
+; RV32I-NEXT:    j .LBB8_26
+; RV32I-NEXT:  .LBB8_24:
+; RV32I-NEXT:    sra a3, a3, a4
+; RV32I-NEXT:    bltu a4, t1, .LBB8_26
+; RV32I-NEXT:  .LBB8_25:
+; RV32I-NEXT:    mv a3, t0
+; RV32I-NEXT:  .LBB8_26:
+; RV32I-NEXT:    sb a3, 12(a2)
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    sb a4, 14(a2)
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    sb a4, 15(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 13(a2)
+; RV32I-NEXT:    sb a5, 8(a2)
+; RV32I-NEXT:    srli a3, a5, 16
+; RV32I-NEXT:    sb a3, 10(a2)
+; RV32I-NEXT:    srli a3, a5, 24
+; RV32I-NEXT:    sb a3, 11(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a3, a1, 16
+; RV32I-NEXT:    sb a3, 2(a2)
+; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    sb a0, 4(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 6(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+  %src = load i128, ptr %src.ptr, align 1
+  %bitOff = load i128, ptr %bitOff.ptr, align 1
+  %res = ashr i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_32bytes:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 9(a0)
+; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a5, 10(a0)
+; RV64I-NEXT:    lbu a6, 11(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    or a5, a5, a3
+; RV64I-NEXT:    lbu a3, 13(a0)
+; RV64I-NEXT:    lbu a4, 12(a0)
+; RV64I-NEXT:    lbu a6, 14(a0)
+; RV64I-NEXT:    lbu a7, 15(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a4, a7, a6
+; RV64I-NEXT:    or a6, a4, a3
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a7, 2(a0)
+; RV64I-NEXT:    lbu t0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    or a7, a4, a3
+; RV64I-NEXT:    lbu a3, 5(a0)
+; RV64I-NEXT:    lbu a4, 4(a0)
+; RV64I-NEXT:    lbu t0, 6(a0)
+; RV64I-NEXT:    lbu t1, 7(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a4, t1, t0
+; RV64I-NEXT:    or t0, a4, a3
+; RV64I-NEXT:    lbu a3, 25(a0)
+; RV64I-NEXT:    lbu a4, 24(a0)
+; RV64I-NEXT:    lbu t1, 26(a0)
+; RV64I-NEXT:    lbu t2, 27(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a4, t2, t1
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 29(a0)
+; RV64I-NEXT:    lbu t1, 28(a0)
+; RV64I-NEXT:    lbu t2, 30(a0)
+; RV64I-NEXT:    lbu t3, 31(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, t1
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t1, t3, t2
+; RV64I-NEXT:    or a4, t1, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 17(a0)
+; RV64I-NEXT:    lbu t1, 16(a0)
+; RV64I-NEXT:    lbu t2, 18(a0)
+; RV64I-NEXT:    lbu t3, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, t1
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t1, t3, t2
+; RV64I-NEXT:    or a4, t1, a4
+; RV64I-NEXT:    lbu t1, 21(a0)
+; RV64I-NEXT:    lbu t2, 20(a0)
+; RV64I-NEXT:    lbu t3, 22(a0)
+; RV64I-NEXT:    lbu a0, 23(a0)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t1, t1, t2
+; RV64I-NEXT:    slli t3, t3, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t3
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a4, a0, a4
+; RV64I-NEXT:    lbu a0, 1(a1)
+; RV64I-NEXT:    lbu t1, 0(a1)
+; RV64I-NEXT:    lbu t2, 2(a1)
+; RV64I-NEXT:    lbu t3, 3(a1)
+; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    lbu t1, 5(a1)
+; RV64I-NEXT:    lbu t4, 4(a1)
+; RV64I-NEXT:    or t2, t3, t2
+; RV64I-NEXT:    or t2, t2, a0
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t1, t1, t4
+; RV64I-NEXT:    lbu t3, 6(a1)
+; RV64I-NEXT:    lbu t4, 7(a1)
+; RV64I-NEXT:    slli a0, a6, 32
+; RV64I-NEXT:    slli a1, t0, 32
+; RV64I-NEXT:    slli t3, t3, 16
+; RV64I-NEXT:    slli t4, t4, 24
+; RV64I-NEXT:    or a6, t4, t3
+; RV64I-NEXT:    or a6, a6, t1
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a6, a6, t2
+; RV64I-NEXT:    addi t1, a6, -128
+; RV64I-NEXT:    addi t2, a6, -192
+; RV64I-NEXT:    slli t0, a3, 1
+; RV64I-NEXT:    bltz t2, .LBB9_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    srl t3, a3, t2
+; RV64I-NEXT:    j .LBB9_3
+; RV64I-NEXT:  .LBB9_2:
+; RV64I-NEXT:    srl t3, a4, t1
+; RV64I-NEXT:    xori t4, t1, 63
+; RV64I-NEXT:    sll t4, t0, t4
+; RV64I-NEXT:    or t3, t3, t4
+; RV64I-NEXT:  .LBB9_3:
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    addi a7, a6, -64
+; RV64I-NEXT:    xori a5, a6, 63
+; RV64I-NEXT:    bltz a7, .LBB9_5
+; RV64I-NEXT:  # %bb.4:
+; RV64I-NEXT:    srl s1, a0, a7
+; RV64I-NEXT:    j .LBB9_6
+; RV64I-NEXT:  .LBB9_5:
+; RV64I-NEXT:    srl t4, a1, a6
+; RV64I-NEXT:    slli t5, a0, 1
+; RV64I-NEXT:    sll t5, t5, a5
+; RV64I-NEXT:    or s1, t4, t5
+; RV64I-NEXT:  .LBB9_6:
+; RV64I-NEXT:    negw t6, a6
+; RV64I-NEXT:    sll t4, a4, t6
+; RV64I-NEXT:    li s0, 64
+; RV64I-NEXT:    li t5, 128
+; RV64I-NEXT:    sub s0, s0, a6
+; RV64I-NEXT:    bltu a6, t5, .LBB9_12
+; RV64I-NEXT:  # %bb.7:
+; RV64I-NEXT:    bnez a6, .LBB9_13
+; RV64I-NEXT:  .LBB9_8:
+; RV64I-NEXT:    bgez s0, .LBB9_10
+; RV64I-NEXT:  .LBB9_9:
+; RV64I-NEXT:    sll t3, a3, t6
+; RV64I-NEXT:    srli t4, a4, 1
+; RV64I-NEXT:    sub t6, t5, a6
+; RV64I-NEXT:    xori t6, t6, 63
+; RV64I-NEXT:    srl t4, t4, t6
+; RV64I-NEXT:    or t4, t3, t4
+; RV64I-NEXT:  .LBB9_10:
+; RV64I-NEXT:    slti t3, a7, 0
+; RV64I-NEXT:    neg t3, t3
+; RV64I-NEXT:    bltu a6, t5, .LBB9_14
+; RV64I-NEXT:  # %bb.11:
+; RV64I-NEXT:    srl t1, a3, t1
+; RV64I-NEXT:    slti t2, t2, 0
+; RV64I-NEXT:    neg t2, t2
+; RV64I-NEXT:    and t1, t2, t1
+; RV64I-NEXT:    bnez a6, .LBB9_15
+; RV64I-NEXT:    j .LBB9_16
+; RV64I-NEXT:  .LBB9_12:
+; RV64I-NEXT:    slti t3, s0, 0
+; RV64I-NEXT:    neg t3, t3
+; RV64I-NEXT:    and t3, t3, t4
+; RV64I-NEXT:    or t3, s1, t3
+; RV64I-NEXT:    beqz a6, .LBB9_8
+; RV64I-NEXT:  .LBB9_13:
+; RV64I-NEXT:    mv a1, t3
+; RV64I-NEXT:    bltz s0, .LBB9_9
+; RV64I-NEXT:    j .LBB9_10
+; RV64I-NEXT:  .LBB9_14:
+; RV64I-NEXT:    srl t1, a0, a6
+; RV64I-NEXT:    and t1, t3, t1
+; RV64I-NEXT:    or t1, t1, t4
+; RV64I-NEXT:    beqz a6, .LBB9_16
+; RV64I-NEXT:  .LBB9_15:
+; RV64I-NEXT:    mv a0, t1
+; RV64I-NEXT:  .LBB9_16:
+; RV64I-NEXT:    bltz a7, .LBB9_18
+; RV64I-NEXT:  # %bb.17:
+; RV64I-NEXT:    srl a4, a3, a7
+; RV64I-NEXT:    j .LBB9_19
+; RV64I-NEXT:  .LBB9_18:
+; RV64I-NEXT:    srl a4, a4, a6
+; RV64I-NEXT:    sll a5, t0, a5
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:  .LBB9_19:
+; RV64I-NEXT:    sltiu a5, a6, 128
+; RV64I-NEXT:    neg a5, a5
+; RV64I-NEXT:    and a4, a5, a4
+; RV64I-NEXT:    srl a3, a3, a6
+; RV64I-NEXT:    and a3, t3, a3
+; RV64I-NEXT:    and a3, a5, a3
+; RV64I-NEXT:    sb a4, 16(a2)
+; RV64I-NEXT:    sb a3, 24(a2)
+; RV64I-NEXT:    srli a5, a4, 56
+; RV64I-NEXT:    sb a5, 23(a2)
+; RV64I-NEXT:    srli a5, a4, 48
+; RV64I-NEXT:    sb a5, 22(a2)
+; RV64I-NEXT:    srli a5, a4, 40
+; RV64I-NEXT:    sb a5, 21(a2)
+; RV64I-NEXT:    srli a5, a4, 32
+; RV64I-NEXT:    sb a5, 20(a2)
+; RV64I-NEXT:    srli a5, a4, 24
+; RV64I-NEXT:    sb a5, 19(a2)
+; RV64I-NEXT:    srli a5, a4, 16
+; RV64I-NEXT:    sb a5, 18(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 17(a2)
+; RV64I-NEXT:    srli a4, a3, 56
+; RV64I-NEXT:    sb a4, 31(a2)
+; RV64I-NEXT:    srli a4, a3, 48
+; RV64I-NEXT:    sb a4, 30(a2)
+; RV64I-NEXT:    srli a4, a3, 40
+; RV64I-NEXT:    sb a4, 29(a2)
+; RV64I-NEXT:    srli a4, a3, 32
+; RV64I-NEXT:    sb a4, 28(a2)
+; RV64I-NEXT:    srli a4, a3, 24
+; RV64I-NEXT:    sb a4, 27(a2)
+; RV64I-NEXT:    srli a4, a3, 16
+; RV64I-NEXT:    sb a4, 26(a2)
+; RV64I-NEXT:    srli a3, a3, 8
+; RV64I-NEXT:    sb a3, 25(a2)
+; RV64I-NEXT:    sb a1, 0(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 7(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 6(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 4(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 2(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 1(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: lshr_32bytes:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t0, 4(a0)
+; RV32I-NEXT:    lbu a6, 5(a0)
+; RV32I-NEXT:    lbu t2, 6(a0)
+; RV32I-NEXT:    lbu t4, 7(a0)
+; RV32I-NEXT:    lbu t1, 0(a0)
+; RV32I-NEXT:    lbu t5, 1(a0)
+; RV32I-NEXT:    lbu t6, 2(a0)
+; RV32I-NEXT:    lbu s0, 3(a0)
+; RV32I-NEXT:    lbu t3, 12(a0)
+; RV32I-NEXT:    lbu a7, 13(a0)
+; RV32I-NEXT:    lbu s1, 14(a0)
+; RV32I-NEXT:    lbu s6, 15(a0)
+; RV32I-NEXT:    lbu s2, 8(a0)
+; RV32I-NEXT:    lbu s3, 9(a0)
+; RV32I-NEXT:    lbu s4, 10(a0)
+; RV32I-NEXT:    lbu s5, 11(a0)
+; RV32I-NEXT:    lbu a3, 21(a0)
+; RV32I-NEXT:    lbu a4, 20(a0)
+; RV32I-NEXT:    lbu a5, 22(a0)
+; RV32I-NEXT:    lbu s7, 23(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    or a4, s7, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 17(a0)
+; RV32I-NEXT:    lbu a5, 16(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or s7, a4, a5
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    or s9, s9, s8
+; RV32I-NEXT:    lbu a4, 29(a0)
+; RV32I-NEXT:    lbu a5, 28(a0)
+; RV32I-NEXT:    lbu s8, 30(a0)
+; RV32I-NEXT:    lbu s10, 31(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s10, s10, 24
+; RV32I-NEXT:    or a5, s10, s8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 25(a0)
+; RV32I-NEXT:    lbu s8, 24(a0)
+; RV32I-NEXT:    lbu s10, 26(a0)
+; RV32I-NEXT:    lbu a0, 27(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, s8
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, s10
+; RV32I-NEXT:    or ra, a0, a5
+; RV32I-NEXT:    lbu a0, 1(a1)
+; RV32I-NEXT:    lbu a5, 0(a1)
+; RV32I-NEXT:    lbu s8, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, s8
+; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    addi a5, a0, -192
+; RV32I-NEXT:    addi a1, a0, -224
+; RV32I-NEXT:    slli s8, a4, 1
+; RV32I-NEXT:    sw s8, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw a5, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw a1, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bltz a1, .LBB9_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    srl s8, a4, a1
+; RV32I-NEXT:    j .LBB9_3
+; RV32I-NEXT:  .LBB9_2:
+; RV32I-NEXT:    srl a1, ra, a5
+; RV32I-NEXT:    xori a5, a5, 31
+; RV32I-NEXT:    sll a5, s8, a5
+; RV32I-NEXT:    or s8, a1, a5
+; RV32I-NEXT:  .LBB9_3:
+; RV32I-NEXT:    slli a5, a7, 8
+; RV32I-NEXT:    slli s10, s1, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    or a7, s9, s7
+; RV32I-NEXT:    addi s1, a0, -128
+; RV32I-NEXT:    slli a1, a3, 1
+; RV32I-NEXT:    addi s9, a0, -160
+; RV32I-NEXT:    xori s11, s1, 31
+; RV32I-NEXT:    sw a1, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bltz s9, .LBB9_5
+; RV32I-NEXT:  # %bb.4:
+; RV32I-NEXT:    srl s7, a3, s9
+; RV32I-NEXT:    j .LBB9_6
+; RV32I-NEXT:  .LBB9_5:
+; RV32I-NEXT:    srl s7, a7, s1
+; RV32I-NEXT:    sll s11, a1, s11
+; RV32I-NEXT:    or s7, s7, s11
+; RV32I-NEXT:  .LBB9_6:
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    or a5, a5, t3
+; RV32I-NEXT:    or s6, s6, s10
+; RV32I-NEXT:    neg s11, a0
+; RV32I-NEXT:    sll s10, ra, s11
+; RV32I-NEXT:    li t3, 160
+; RV32I-NEXT:    li a1, 64
+; RV32I-NEXT:    sub t3, t3, a0
+; RV32I-NEXT:    sw s10, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw t3, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bgeu s1, a1, .LBB9_8
+; RV32I-NEXT:  # %bb.7:
+; RV32I-NEXT:    slti t3, t3, 0
+; RV32I-NEXT:    neg t3, t3
+; RV32I-NEXT:    and t3, t3, s10
+; RV32I-NEXT:    or s8, s7, t3
+; RV32I-NEXT:  .LBB9_8:
+; RV32I-NEXT:    slli s10, a6, 8
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    slli t6, t6, 16
+; RV32I-NEXT:    slli s0, s0, 24
+; RV32I-NEXT:    or s2, s3, s2
+; RV32I-NEXT:    or s3, s5, s4
+; RV32I-NEXT:    or a6, s6, a5
+; RV32I-NEXT:    mv s7, a7
+; RV32I-NEXT:    beqz s1, .LBB9_10
+; RV32I-NEXT:  # %bb.9:
+; RV32I-NEXT:    mv s7, s8
+; RV32I-NEXT:  .LBB9_10:
+; RV32I-NEXT:    or t0, s10, t0
+; RV32I-NEXT:    or t2, t4, t2
+; RV32I-NEXT:    or t1, t5, t1
+; RV32I-NEXT:    or t4, s0, t6
+; RV32I-NEXT:    or s5, s3, s2
+; RV32I-NEXT:    addi a1, a0, -64
+; RV32I-NEXT:    slli t5, a6, 1
+; RV32I-NEXT:    addi s4, a0, -96
+; RV32I-NEXT:    xori t3, a1, 31
+; RV32I-NEXT:    sw a1, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw t3, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw t5, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bltz s4, .LBB9_12
+; RV32I-NEXT:  # %bb.11:
+; RV32I-NEXT:    srl a5, a6, s4
+; RV32I-NEXT:    j .LBB9_13
+; RV32I-NEXT:  .LBB9_12:
+; RV32I-NEXT:    srl a5, s5, a1
+; RV32I-NEXT:    sll t3, t5, t3
+; RV32I-NEXT:    or a5, a5, t3
+; RV32I-NEXT:  .LBB9_13:
+; RV32I-NEXT:    li t5, 64
+; RV32I-NEXT:    or s3, t2, t0
+; RV32I-NEXT:    or t1, t4, t1
+; RV32I-NEXT:    addi t6, a0, -32
+; RV32I-NEXT:    xori s10, a0, 31
+; RV32I-NEXT:    bltz t6, .LBB9_15
+; RV32I-NEXT:  # %bb.14:
+; RV32I-NEXT:    srl t4, s3, t6
+; RV32I-NEXT:    j .LBB9_16
+; RV32I-NEXT:  .LBB9_15:
+; RV32I-NEXT:    srl t0, t1, a0
+; RV32I-NEXT:    slli t2, s3, 1
+; RV32I-NEXT:    sll t2, t2, s10
+; RV32I-NEXT:    or t4, t0, t2
+; RV32I-NEXT:  .LBB9_16:
+; RV32I-NEXT:    sll t2, s5, s11
+; RV32I-NEXT:    li t0, 32
+; RV32I-NEXT:    sub s0, t0, a0
+; RV32I-NEXT:    slti t3, s0, 0
+; RV32I-NEXT:    neg a1, t3
+; RV32I-NEXT:    bgeu a0, t5, .LBB9_18
+; RV32I-NEXT:  # %bb.17:
+; RV32I-NEXT:    and a5, a1, t2
+; RV32I-NEXT:    or a5, t4, a5
+; RV32I-NEXT:  .LBB9_18:
+; RV32I-NEXT:    mv s8, t1
+; RV32I-NEXT:    beqz a0, .LBB9_20
+; RV32I-NEXT:  # %bb.19:
+; RV32I-NEXT:    mv s8, a5
+; RV32I-NEXT:  .LBB9_20:
+; RV32I-NEXT:    sll a5, a7, s11
+; RV32I-NEXT:    li t3, 96
+; RV32I-NEXT:    sub s6, t3, a0
+; RV32I-NEXT:    slti t3, s6, 0
+; RV32I-NEXT:    neg t4, t3
+; RV32I-NEXT:    li s2, 128
+; RV32I-NEXT:    sub t5, s2, a0
+; RV32I-NEXT:    sltiu t3, t5, 64
+; RV32I-NEXT:    neg t3, t3
+; RV32I-NEXT:    sw t3, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bgeu a0, s2, .LBB9_22
+; RV32I-NEXT:  # %bb.21:
+; RV32I-NEXT:    mv s2, t3
+; RV32I-NEXT:    and t3, t4, a5
+; RV32I-NEXT:    and t3, s2, t3
+; RV32I-NEXT:    or s7, s8, t3
+; RV32I-NEXT:  .LBB9_22:
+; RV32I-NEXT:    li s8, 64
+; RV32I-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    beqz a0, .LBB9_24
+; RV32I-NEXT:  # %bb.23:
+; RV32I-NEXT:    mv t1, s7
+; RV32I-NEXT:  .LBB9_24:
+; RV32I-NEXT:    neg t3, t5
+; RV32I-NEXT:    sub s0, t0, t5
+; RV32I-NEXT:    srl t0, a3, t3
+; RV32I-NEXT:    sw a1, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw t0, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bgez s0, .LBB9_26
+; RV32I-NEXT:  # %bb.25:
+; RV32I-NEXT:    srl t0, a7, t3
+; RV32I-NEXT:    sub t3, s8, t5
+; RV32I-NEXT:    xori t3, t3, 31
+; RV32I-NEXT:    lw a1, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll t3, a1, t3
+; RV32I-NEXT:    lw a1, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or t0, t0, t3
+; RV32I-NEXT:  .LBB9_26:
+; RV32I-NEXT:    bltu t5, s8, .LBB9_28
+; RV32I-NEXT:  # %bb.27:
+; RV32I-NEXT:    and t3, a1, a5
+; RV32I-NEXT:    mv t0, ra
+; RV32I-NEXT:    bnez t5, .LBB9_29
+; RV32I-NEXT:    j .LBB9_30
+; RV32I-NEXT:  .LBB9_28:
+; RV32I-NEXT:    lw t3, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    and t3, t4, t3
+; RV32I-NEXT:    or t3, t3, t0
+; RV32I-NEXT:    mv t0, ra
+; RV32I-NEXT:    beqz t5, .LBB9_30
+; RV32I-NEXT:  .LBB9_29:
+; RV32I-NEXT:    mv t0, t3
+; RV32I-NEXT:  .LBB9_30:
+; RV32I-NEXT:    bltz t6, .LBB9_32
+; RV32I-NEXT:  # %bb.31:
+; RV32I-NEXT:    srl t4, a6, t6
+; RV32I-NEXT:    j .LBB9_33
+; RV32I-NEXT:  .LBB9_32:
+; RV32I-NEXT:    srl t3, s5, a0
+; RV32I-NEXT:    lw a1, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll t4, a1, s10
+; RV32I-NEXT:    or t4, t3, t4
+; RV32I-NEXT:  .LBB9_33:
+; RV32I-NEXT:    sltiu s0, a0, 64
+; RV32I-NEXT:    sw s10, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bltz s9, .LBB9_35
+; RV32I-NEXT:  # %bb.34:
+; RV32I-NEXT:    srl a1, a4, s9
+; RV32I-NEXT:    j .LBB9_36
+; RV32I-NEXT:  .LBB9_35:
+; RV32I-NEXT:    srl t3, ra, s1
+; RV32I-NEXT:    lw s7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a1, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll a1, s7, a1
+; RV32I-NEXT:    or a1, t3, a1
+; RV32I-NEXT:  .LBB9_36:
+; RV32I-NEXT:    neg s10, s0
+; RV32I-NEXT:    sltiu t3, s1, 64
+; RV32I-NEXT:    neg s0, t3
+; RV32I-NEXT:    li t3, 128
+; RV32I-NEXT:    sw ra, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bltu a0, t3, .LBB9_38
+; RV32I-NEXT:  # %bb.37:
+; RV32I-NEXT:    and a1, s0, a1
+; RV32I-NEXT:    j .LBB9_39
+; RV32I-NEXT:  .LBB9_38:
+; RV32I-NEXT:    and a1, s10, t4
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:  .LBB9_39:
+; RV32I-NEXT:    lw t3, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    mv ra, s5
+; RV32I-NEXT:    beqz a0, .LBB9_41
+; RV32I-NEXT:  # %bb.40:
+; RV32I-NEXT:    mv ra, a1
+; RV32I-NEXT:  .LBB9_41:
+; RV32I-NEXT:    sub a1, s8, a0
+; RV32I-NEXT:    xori t4, a1, 31
+; RV32I-NEXT:    lw a1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sw s0, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bgez a1, .LBB9_43
+; RV32I-NEXT:  # %bb.42:
+; RV32I-NEXT:    sll a1, a6, s11
+; RV32I-NEXT:    srli t0, s5, 1
+; RV32I-NEXT:    srl t0, t0, t4
+; RV32I-NEXT:    or t2, a1, t0
+; RV32I-NEXT:  .LBB9_43:
+; RV32I-NEXT:    slti a1, t6, 0
+; RV32I-NEXT:    neg s2, a1
+; RV32I-NEXT:    slti t0, s4, 0
+; RV32I-NEXT:    neg s0, t0
+; RV32I-NEXT:    bltu a0, s8, .LBB9_45
+; RV32I-NEXT:  # %bb.44:
+; RV32I-NEXT:    lw a1, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srl t0, a6, a1
+; RV32I-NEXT:    and t2, s0, t0
+; RV32I-NEXT:    j .LBB9_46
+; RV32I-NEXT:  .LBB9_45:
+; RV32I-NEXT:    srl t0, s3, a0
+; RV32I-NEXT:    and t0, s2, t0
+; RV32I-NEXT:    or t2, t0, t2
+; RV32I-NEXT:  .LBB9_46:
+; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw t4, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    li a1, 64
+; RV32I-NEXT:    mv t0, s3
+; RV32I-NEXT:    beqz a0, .LBB9_48
+; RV32I-NEXT:  # %bb.47:
+; RV32I-NEXT:    mv t0, t2
+; RV32I-NEXT:  .LBB9_48:
+; RV32I-NEXT:    sll s7, a3, s11
+; RV32I-NEXT:    srli s8, a7, 1
+; RV32I-NEXT:    xori s0, t5, 31
+; RV32I-NEXT:    bltz s6, .LBB9_50
+; RV32I-NEXT:  # %bb.49:
+; RV32I-NEXT:    mv t4, a5
+; RV32I-NEXT:    j .LBB9_51
+; RV32I-NEXT:  .LBB9_50:
+; RV32I-NEXT:    srl t2, s8, s0
+; RV32I-NEXT:    or t4, s7, t2
+; RV32I-NEXT:  .LBB9_51:
+; RV32I-NEXT:    sll s5, a4, s11
+; RV32I-NEXT:    lw t2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srli s11, t2, 1
+; RV32I-NEXT:    bltz t3, .LBB9_53
+; RV32I-NEXT:  # %bb.52:
+; RV32I-NEXT:    lw t3, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    j .LBB9_54
+; RV32I-NEXT:  .LBB9_53:
+; RV32I-NEXT:    li t2, 192
+; RV32I-NEXT:    sub t2, t2, a0
+; RV32I-NEXT:    xori t2, t2, 31
+; RV32I-NEXT:    srl t2, s11, t2
+; RV32I-NEXT:    or t3, s5, t2
+; RV32I-NEXT:  .LBB9_54:
+; RV32I-NEXT:    slti t2, s9, 0
+; RV32I-NEXT:    neg t2, t2
+; RV32I-NEXT:    bltu s1, a1, .LBB9_56
+; RV32I-NEXT:  # %bb.55:
+; RV32I-NEXT:    lw a1, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srl t3, a4, a1
+; RV32I-NEXT:    lw a1, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    slti s9, a1, 0
+; RV32I-NEXT:    neg s9, s9
+; RV32I-NEXT:    and t3, s9, t3
+; RV32I-NEXT:    mv s9, a3
+; RV32I-NEXT:    bnez s1, .LBB9_57
+; RV32I-NEXT:    j .LBB9_58
+; RV32I-NEXT:  .LBB9_56:
+; RV32I-NEXT:    srl s9, a3, s1
+; RV32I-NEXT:    and s9, t2, s9
+; RV32I-NEXT:    or t3, s9, t3
+; RV32I-NEXT:    mv s9, a3
+; RV32I-NEXT:    beqz s1, .LBB9_58
+; RV32I-NEXT:  .LBB9_57:
+; RV32I-NEXT:    mv s9, t3
+; RV32I-NEXT:  .LBB9_58:
+; RV32I-NEXT:    li a1, 128
+; RV32I-NEXT:    bltu a0, a1, .LBB9_63
+; RV32I-NEXT:  # %bb.59:
+; RV32I-NEXT:    lw t3, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bnez a0, .LBB9_64
+; RV32I-NEXT:  .LBB9_60:
+; RV32I-NEXT:    lw a1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bltz a1, .LBB9_65
+; RV32I-NEXT:  .LBB9_61:
+; RV32I-NEXT:    li s7, 64
+; RV32I-NEXT:    bltz s6, .LBB9_66
+; RV32I-NEXT:  .LBB9_62:
+; RV32I-NEXT:    lw t4, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    mv t0, t4
+; RV32I-NEXT:    bltu t5, s7, .LBB9_67
+; RV32I-NEXT:    j .LBB9_68
+; RV32I-NEXT:  .LBB9_63:
+; RV32I-NEXT:    lw a1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    and t3, a1, t4
+; RV32I-NEXT:    or s9, t0, t3
+; RV32I-NEXT:    lw t3, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    beqz a0, .LBB9_60
+; RV32I-NEXT:  .LBB9_64:
+; RV32I-NEXT:    mv s3, s9
+; RV32I-NEXT:    lw a1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bgez a1, .LBB9_61
+; RV32I-NEXT:  .LBB9_65:
+; RV32I-NEXT:    lw a1, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srl a5, s8, a1
+; RV32I-NEXT:    or a5, s7, a5
+; RV32I-NEXT:    li s7, 64
+; RV32I-NEXT:    bgez s6, .LBB9_62
+; RV32I-NEXT:  .LBB9_66:
+; RV32I-NEXT:    srl t0, s11, s0
+; RV32I-NEXT:    or t0, s5, t0
+; RV32I-NEXT:    lw t4, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bgeu t5, s7, .LBB9_68
+; RV32I-NEXT:  .LBB9_67:
+; RV32I-NEXT:    lw a1, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    slti a5, a1, 0
+; RV32I-NEXT:    neg a5, a5
+; RV32I-NEXT:    lw a1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    and a5, a5, a1
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:  .LBB9_68:
+; RV32I-NEXT:    mv t0, a4
+; RV32I-NEXT:    bnez t5, .LBB9_71
+; RV32I-NEXT:  # %bb.69:
+; RV32I-NEXT:    li a1, 128
+; RV32I-NEXT:    bltu a0, a1, .LBB9_72
+; RV32I-NEXT:  .LBB9_70:
+; RV32I-NEXT:    srl a5, a4, s1
+; RV32I-NEXT:    and a5, t2, a5
+; RV32I-NEXT:    lw a1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    and a5, a1, a5
+; RV32I-NEXT:    lw t5, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bnez a0, .LBB9_73
+; RV32I-NEXT:    j .LBB9_74
+; RV32I-NEXT:  .LBB9_71:
+; RV32I-NEXT:    mv t0, a5
+; RV32I-NEXT:    li a1, 128
+; RV32I-NEXT:    bgeu a0, a1, .LBB9_70
+; RV32I-NEXT:  .LBB9_72:
+; RV32I-NEXT:    srl a5, a6, a0
+; RV32I-NEXT:    and a5, s2, a5
+; RV32I-NEXT:    and a5, s10, a5
+; RV32I-NEXT:    or a5, a5, t0
+; RV32I-NEXT:    lw t5, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    beqz a0, .LBB9_74
+; RV32I-NEXT:  .LBB9_73:
+; RV32I-NEXT:    mv a6, a5
+; RV32I-NEXT:  .LBB9_74:
+; RV32I-NEXT:    bltz s4, .LBB9_77
+; RV32I-NEXT:  # %bb.75:
+; RV32I-NEXT:    srl a5, a4, s4
+; RV32I-NEXT:    bgez t6, .LBB9_78
+; RV32I-NEXT:  .LBB9_76:
+; RV32I-NEXT:    srl t0, a7, a0
+; RV32I-NEXT:    lw a1, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw t2, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll t2, a1, t2
+; RV32I-NEXT:    or t0, t0, t2
+; RV32I-NEXT:    lw a1, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bltu a0, s7, .LBB9_79
+; RV32I-NEXT:    j .LBB9_80
+; RV32I-NEXT:  .LBB9_77:
+; RV32I-NEXT:    lw a5, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srl a5, a5, t5
+; RV32I-NEXT:    lw a1, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll t0, t3, a1
+; RV32I-NEXT:    or a5, a5, t0
+; RV32I-NEXT:    bltz t6, .LBB9_76
+; RV32I-NEXT:  .LBB9_78:
+; RV32I-NEXT:    srl t0, a3, t6
+; RV32I-NEXT:    lw a1, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bgeu a0, s7, .LBB9_80
+; RV32I-NEXT:  .LBB9_79:
+; RV32I-NEXT:    and a5, a1, t4
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:  .LBB9_80:
+; RV32I-NEXT:    bnez a0, .LBB9_84
+; RV32I-NEXT:  # %bb.81:
+; RV32I-NEXT:    lw a1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bltz a1, .LBB9_85
+; RV32I-NEXT:  .LBB9_82:
+; RV32I-NEXT:    sltiu a5, a0, 128
+; RV32I-NEXT:    bltu a0, s7, .LBB9_86
+; RV32I-NEXT:  .LBB9_83:
+; RV32I-NEXT:    srl t0, a4, t5
+; RV32I-NEXT:    lw a1, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    and t2, a1, t0
+; RV32I-NEXT:    neg t0, a5
+; RV32I-NEXT:    bnez a0, .LBB9_87
+; RV32I-NEXT:    j .LBB9_88
+; RV32I-NEXT:  .LBB9_84:
+; RV32I-NEXT:    mv a7, a5
+; RV32I-NEXT:    lw a1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bgez a1, .LBB9_82
+; RV32I-NEXT:  .LBB9_85:
+; RV32I-NEXT:    lw a1, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srl a5, s11, a1
+; RV32I-NEXT:    or t4, s5, a5
+; RV32I-NEXT:    sltiu a5, a0, 128
+; RV32I-NEXT:    bgeu a0, s7, .LBB9_83
+; RV32I-NEXT:  .LBB9_86:
+; RV32I-NEXT:    srl t0, a3, a0
+; RV32I-NEXT:    and t0, s2, t0
+; RV32I-NEXT:    or t2, t0, t4
+; RV32I-NEXT:    neg t0, a5
+; RV32I-NEXT:    beqz a0, .LBB9_88
+; RV32I-NEXT:  .LBB9_87:
+; RV32I-NEXT:    mv a3, t2
+; RV32I-NEXT:  .LBB9_88:
+; RV32I-NEXT:    and a5, t0, a7
+; RV32I-NEXT:    and a3, t0, a3
+; RV32I-NEXT:    bltz t6, .LBB9_90
+; RV32I-NEXT:  # %bb.89:
+; RV32I-NEXT:    srl a7, a4, t6
+; RV32I-NEXT:    j .LBB9_91
+; RV32I-NEXT:  .LBB9_90:
+; RV32I-NEXT:    lw a7, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srl a7, a7, a0
+; RV32I-NEXT:    lw a1, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll t2, t3, a1
+; RV32I-NEXT:    or a7, a7, t2
+; RV32I-NEXT:  .LBB9_91:
+; RV32I-NEXT:    and a7, s10, a7
+; RV32I-NEXT:    and a7, t0, a7
+; RV32I-NEXT:    srl a0, a4, a0
+; RV32I-NEXT:    and a0, s2, a0
+; RV32I-NEXT:    and a0, s10, a0
+; RV32I-NEXT:    and a0, t0, a0
+; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    sb a0, 28(a2)
+; RV32I-NEXT:    srli a1, a7, 24
+; RV32I-NEXT:    sb a1, 27(a2)
+; RV32I-NEXT:    srli a1, a7, 16
+; RV32I-NEXT:    sb a1, 26(a2)
+; RV32I-NEXT:    srli a1, a7, 8
+; RV32I-NEXT:    sb a1, 25(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 31(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 30(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 29(a2)
+; RV32I-NEXT:    sb a5, 16(a2)
+; RV32I-NEXT:    srli a0, a5, 24
+; RV32I-NEXT:    sb a0, 19(a2)
+; RV32I-NEXT:    srli a0, a5, 16
+; RV32I-NEXT:    sb a0, 18(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 17(a2)
+; RV32I-NEXT:    sb a3, 20(a2)
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a0, 23(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    sb a0, 22(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 21(a2)
+; RV32I-NEXT:    sb t1, 0(a2)
+; RV32I-NEXT:    sb a6, 12(a2)
+; RV32I-NEXT:    srli a0, t1, 24
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    srli a0, t1, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a0, t1, 8
+; RV32I-NEXT:    sb a0, 1(a2)
+; RV32I-NEXT:    sb s3, 4(a2)
+; RV32I-NEXT:    sb ra, 8(a2)
+; RV32I-NEXT:    srli a0, a6, 24
+; RV32I-NEXT:    sb a0, 15(a2)
+; RV32I-NEXT:    srli a0, a6, 16
+; RV32I-NEXT:    sb a0, 14(a2)
+; RV32I-NEXT:    srli a0, a6, 8
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    srli a0, s3, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a0, s3, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, s3, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    srli a0, ra, 24
+; RV32I-NEXT:    sb a0, 11(a2)
+; RV32I-NEXT:    srli a0, ra, 16
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    srli a0, ra, 8
+; RV32I-NEXT:    sb a0, 9(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %bitOff = load i256, ptr %bitOff.ptr, align 1
+  %res = lshr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 17(a0)
+; RV64I-NEXT:    lbu a4, 16(a0)
+; RV64I-NEXT:    lbu a5, 18(a0)
+; RV64I-NEXT:    lbu a6, 19(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    or a5, a5, a3
+; RV64I-NEXT:    lbu a3, 21(a0)
+; RV64I-NEXT:    lbu a4, 20(a0)
+; RV64I-NEXT:    lbu a6, 22(a0)
+; RV64I-NEXT:    lbu a7, 23(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a4, a7, a6
+; RV64I-NEXT:    or a6, a4, a3
+; RV64I-NEXT:    lbu a3, 25(a0)
+; RV64I-NEXT:    lbu a4, 24(a0)
+; RV64I-NEXT:    lbu a7, 26(a0)
+; RV64I-NEXT:    lbu t0, 27(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    or a7, a4, a3
+; RV64I-NEXT:    lbu a3, 29(a0)
+; RV64I-NEXT:    lbu a4, 28(a0)
+; RV64I-NEXT:    lbu t0, 30(a0)
+; RV64I-NEXT:    lbu t1, 31(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a4, t1, t0
+; RV64I-NEXT:    or t0, a4, a3
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu t1, 2(a0)
+; RV64I-NEXT:    lbu t2, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a4, t2, t1
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 5(a0)
+; RV64I-NEXT:    lbu t1, 4(a0)
+; RV64I-NEXT:    lbu t2, 6(a0)
+; RV64I-NEXT:    lbu t3, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, t1
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t1, t3, t2
+; RV64I-NEXT:    or a4, t1, a4
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    lbu a4, 9(a0)
+; RV64I-NEXT:    lbu t1, 8(a0)
+; RV64I-NEXT:    lbu t2, 10(a0)
+; RV64I-NEXT:    lbu t3, 11(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a4, a4, t1
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t1, t3, t2
+; RV64I-NEXT:    or a4, t1, a4
+; RV64I-NEXT:    lbu t1, 13(a0)
+; RV64I-NEXT:    lbu t2, 12(a0)
+; RV64I-NEXT:    lbu t3, 14(a0)
+; RV64I-NEXT:    lbu a0, 15(a0)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t1, t1, t2
+; RV64I-NEXT:    slli t3, t3, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t3
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a4, a0, a4
+; RV64I-NEXT:    lbu a0, 1(a1)
+; RV64I-NEXT:    lbu t1, 0(a1)
+; RV64I-NEXT:    lbu t2, 2(a1)
+; RV64I-NEXT:    lbu t3, 3(a1)
+; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    lbu t1, 5(a1)
+; RV64I-NEXT:    lbu t4, 4(a1)
+; RV64I-NEXT:    or t2, t3, t2
+; RV64I-NEXT:    or t2, t2, a0
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t1, t1, t4
+; RV64I-NEXT:    lbu t3, 6(a1)
+; RV64I-NEXT:    lbu t4, 7(a1)
+; RV64I-NEXT:    slli a0, a6, 32
+; RV64I-NEXT:    slli a1, t0, 32
+; RV64I-NEXT:    slli t3, t3, 16
+; RV64I-NEXT:    slli t4, t4, 24
+; RV64I-NEXT:    or a6, t4, t3
+; RV64I-NEXT:    or a6, a6, t1
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a6, a6, t2
+; RV64I-NEXT:    addi t1, a6, -128
+; RV64I-NEXT:    addi t2, a6, -192
+; RV64I-NEXT:    srli t0, a3, 1
+; RV64I-NEXT:    bltz t2, .LBB10_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    sll t3, a3, t2
+; RV64I-NEXT:    j .LBB10_3
+; RV64I-NEXT:  .LBB10_2:
+; RV64I-NEXT:    sll t3, a4, t1
+; RV64I-NEXT:    xori t4, t1, 63
+; RV64I-NEXT:    srl t4, t0, t4
+; RV64I-NEXT:    or t3, t3, t4
+; RV64I-NEXT:  .LBB10_3:
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    addi a7, a6, -64
+; RV64I-NEXT:    xori a5, a6, 63
+; RV64I-NEXT:    bltz a7, .LBB10_5
+; RV64I-NEXT:  # %bb.4:
+; RV64I-NEXT:    sll s1, a0, a7
+; RV64I-NEXT:    j .LBB10_6
+; RV64I-NEXT:  .LBB10_5:
+; RV64I-NEXT:    sll t4, a1, a6
+; RV64I-NEXT:    srli t5, a0, 1
+; RV64I-NEXT:    srl t5, t5, a5
+; RV64I-NEXT:    or s1, t4, t5
+; RV64I-NEXT:  .LBB10_6:
+; RV64I-NEXT:    negw t6, a6
+; RV64I-NEXT:    srl t4, a4, t6
+; RV64I-NEXT:    li s0, 64
+; RV64I-NEXT:    li t5, 128
+; RV64I-NEXT:    sub s0, s0, a6
+; RV64I-NEXT:    bltu a6, t5, .LBB10_12
+; RV64I-NEXT:  # %bb.7:
+; RV64I-NEXT:    bnez a6, .LBB10_13
+; RV64I-NEXT:  .LBB10_8:
+; RV64I-NEXT:    bgez s0, .LBB10_10
+; RV64I-NEXT:  .LBB10_9:
+; RV64I-NEXT:    srl t3, a3, t6
+; RV64I-NEXT:    slli t4, a4, 1
+; RV64I-NEXT:    sub t6, t5, a6
+; RV64I-NEXT:    xori t6, t6, 63
+; RV64I-NEXT:    sll t4, t4, t6
+; RV64I-NEXT:    or t4, t3, t4
+; RV64I-NEXT:  .LBB10_10:
+; RV64I-NEXT:    slti t3, a7, 0
+; RV64I-NEXT:    neg t3, t3
+; RV64I-NEXT:    bltu a6, t5, .LBB10_14
+; RV64I-NEXT:  # %bb.11:
+; RV64I-NEXT:    sll t1, a3, t1
+; RV64I-NEXT:    slti t2, t2, 0
+; RV64I-NEXT:    neg t2, t2
+; RV64I-NEXT:    and t1, t2, t1
+; RV64I-NEXT:    bnez a6, .LBB10_15
+; RV64I-NEXT:    j .LBB10_16
+; RV64I-NEXT:  .LBB10_12:
+; RV64I-NEXT:    slti t3, s0, 0
+; RV64I-NEXT:    neg t3, t3
+; RV64I-NEXT:    and t3, t3, t4
+; RV64I-NEXT:    or t3, s1, t3
+; RV64I-NEXT:    beqz a6, .LBB10_8
+; RV64I-NEXT:  .LBB10_13:
+; RV64I-NEXT:    mv a1, t3
+; RV64I-NEXT:    bltz s0, .LBB10_9
+; RV64I-NEXT:    j .LBB10_10
+; RV64I-NEXT:  .LBB10_14:
+; RV64I-NEXT:    sll t1, a0, a6
+; RV64I-NEXT:    and t1, t3, t1
+; RV64I-NEXT:    or t1, t1, t4
+; RV64I-NEXT:    beqz a6, .LBB10_16
+; RV64I-NEXT:  .LBB10_15:
+; RV64I-NEXT:    mv a0, t1
+; RV64I-NEXT:  .LBB10_16:
+; RV64I-NEXT:    bltz a7, .LBB10_18
+; RV64I-NEXT:  # %bb.17:
+; RV64I-NEXT:    sll a4, a3, a7
+; RV64I-NEXT:    j .LBB10_19
+; RV64I-NEXT:  .LBB10_18:
+; RV64I-NEXT:    sll a4, a4, a6
+; RV64I-NEXT:    srl a5, t0, a5
+; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:  .LBB10_19:
+; RV64I-NEXT:    sltiu a5, a6, 128
+; RV64I-NEXT:    neg a5, a5
+; RV64I-NEXT:    and a4, a5, a4
+; RV64I-NEXT:    sll a3, a3, a6
+; RV64I-NEXT:    and a3, t3, a3
+; RV64I-NEXT:    and a3, a5, a3
+; RV64I-NEXT:    sb a3, 0(a2)
+; RV64I-NEXT:    sb a4, 8(a2)
+; RV64I-NEXT:    srli a5, a3, 56
+; RV64I-NEXT:    sb a5, 7(a2)
+; RV64I-NEXT:    srli a5, a3, 48
+; RV64I-NEXT:    sb a5, 6(a2)
+; RV64I-NEXT:    srli a5, a3, 40
+; RV64I-NEXT:    sb a5, 5(a2)
+; RV64I-NEXT:    srli a5, a3, 32
+; RV64I-NEXT:    sb a5, 4(a2)
+; RV64I-NEXT:    srli a5, a3, 24
+; RV64I-NEXT:    sb a5, 3(a2)
+; RV64I-NEXT:    srli a5, a3, 16
+; RV64I-NEXT:    sb a5, 2(a2)
+; RV64I-NEXT:    srli a3, a3, 8
+; RV64I-NEXT:    sb a3, 1(a2)
+; RV64I-NEXT:    srli a3, a4, 56
+; RV64I-NEXT:    sb a3, 15(a2)
+; RV64I-NEXT:    srli a3, a4, 48
+; RV64I-NEXT:    sb a3, 14(a2)
+; RV64I-NEXT:    srli a3, a4, 40
+; RV64I-NEXT:    sb a3, 13(a2)
+; RV64I-NEXT:    srli a3, a4, 32
+; RV64I-NEXT:    sb a3, 12(a2)
+; RV64I-NEXT:    srli a3, a4, 24
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    srli a3, a4, 16
+; RV64I-NEXT:    sb a3, 10(a2)
+; RV64I-NEXT:    srli a4, a4, 8
+; RV64I-NEXT:    sb a4, 9(a2)
+; RV64I-NEXT:    sb a1, 24(a2)
+; RV64I-NEXT:    sb a0, 16(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 31(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 30(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 29(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 28(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 27(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 26(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 25(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 23(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 21(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 20(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 19(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 18(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 17(a2)
+; RV64I-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: shl_32bytes:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a7, 24(a0)
+; RV32I-NEXT:    lbu t3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
+; RV32I-NEXT:    lbu t0, 28(a0)
+; RV32I-NEXT:    lbu s0, 29(a0)
+; RV32I-NEXT:    lbu s1, 30(a0)
+; RV32I-NEXT:    lbu s3, 31(a0)
+; RV32I-NEXT:    lbu a6, 16(a0)
+; RV32I-NEXT:    lbu t6, 17(a0)
+; RV32I-NEXT:    lbu s2, 18(a0)
+; RV32I-NEXT:    lbu s6, 19(a0)
+; RV32I-NEXT:    lbu s4, 20(a0)
+; RV32I-NEXT:    lbu t1, 21(a0)
+; RV32I-NEXT:    lbu t2, 22(a0)
+; RV32I-NEXT:    lbu s5, 23(a0)
+; RV32I-NEXT:    lbu a3, 9(a0)
+; RV32I-NEXT:    lbu a4, 8(a0)
+; RV32I-NEXT:    lbu a5, 10(a0)
+; RV32I-NEXT:    lbu s7, 11(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    or a4, s7, a5
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 13(a0)
+; RV32I-NEXT:    lbu a5, 12(a0)
+; RV32I-NEXT:    lbu s7, 14(a0)
+; RV32I-NEXT:    lbu s9, 15(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or s8, a4, a5
+; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    or s9, s9, s7
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 0(a0)
+; RV32I-NEXT:    lbu s7, 2(a0)
+; RV32I-NEXT:    lbu s10, 3(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    slli s10, s10, 24
+; RV32I-NEXT:    or a5, s10, s7
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 5(a0)
+; RV32I-NEXT:    lbu s7, 4(a0)
+; RV32I-NEXT:    lbu s10, 6(a0)
+; RV32I-NEXT:    lbu a0, 7(a0)
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, s7
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, s10
+; RV32I-NEXT:    or s10, a0, a5
+; RV32I-NEXT:    lbu a0, 1(a1)
+; RV32I-NEXT:    lbu a5, 0(a1)
+; RV32I-NEXT:    lbu s7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, s7
+; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    addi a5, a0, -192
+; RV32I-NEXT:    addi a1, a0, -224
+; RV32I-NEXT:    srli s7, a4, 1
+; RV32I-NEXT:    sw s10, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw a5, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw a1, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bltz a1, .LBB10_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sll s7, a4, a1
+; RV32I-NEXT:    j .LBB10_3
+; RV32I-NEXT:  .LBB10_2:
+; RV32I-NEXT:    sll a1, s10, a5
+; RV32I-NEXT:    xori a5, a5, 31
+; RV32I-NEXT:    srl a5, s7, a5
+; RV32I-NEXT:    or s7, a1, a5
+; RV32I-NEXT:  .LBB10_3:
+; RV32I-NEXT:    slli s10, t6, 8
+; RV32I-NEXT:    slli ra, s2, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    or t6, s9, s8
+; RV32I-NEXT:    addi s2, a0, -128
+; RV32I-NEXT:    srli a1, a3, 1
+; RV32I-NEXT:    addi s11, a0, -160
+; RV32I-NEXT:    xori s8, s2, 31
+; RV32I-NEXT:    sw a1, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bltz s11, .LBB10_5
+; RV32I-NEXT:  # %bb.4:
+; RV32I-NEXT:    sll s8, a3, s11
+; RV32I-NEXT:    j .LBB10_6
+; RV32I-NEXT:  .LBB10_5:
+; RV32I-NEXT:    sll a5, t6, s2
+; RV32I-NEXT:    srl s8, a1, s8
+; RV32I-NEXT:    or s8, a5, s8
+; RV32I-NEXT:  .LBB10_6:
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    slli a5, t2, 16
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    or a6, s10, a6
+; RV32I-NEXT:    or s6, s6, ra
+; RV32I-NEXT:    neg s10, a0
+; RV32I-NEXT:    lw t2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srl s9, t2, s10
+; RV32I-NEXT:    li t2, 160
+; RV32I-NEXT:    li ra, 64
+; RV32I-NEXT:    sub t2, t2, a0
+; RV32I-NEXT:    li a1, 64
+; RV32I-NEXT:    sw s9, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw t2, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bgeu s2, ra, .LBB10_8
+; RV32I-NEXT:  # %bb.7:
+; RV32I-NEXT:    slti t2, t2, 0
+; RV32I-NEXT:    neg t2, t2
+; RV32I-NEXT:    and t2, t2, s9
+; RV32I-NEXT:    or s7, s8, t2
+; RV32I-NEXT:  .LBB10_8:
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    or s4, t1, s4
+; RV32I-NEXT:    or s5, s5, a5
+; RV32I-NEXT:    or ra, s6, a6
+; RV32I-NEXT:    sw t6, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv a6, t6
+; RV32I-NEXT:    beqz s2, .LBB10_10
+; RV32I-NEXT:  # %bb.9:
+; RV32I-NEXT:    mv a6, s7
+; RV32I-NEXT:  .LBB10_10:
+; RV32I-NEXT:    or a5, t3, a7
+; RV32I-NEXT:    or a7, t5, t4
+; RV32I-NEXT:    or t0, s0, t0
+; RV32I-NEXT:    or t1, s3, s1
+; RV32I-NEXT:    or s6, s5, s4
+; RV32I-NEXT:    addi t4, a0, -64
+; RV32I-NEXT:    srli s0, ra, 1
+; RV32I-NEXT:    addi t6, a0, -96
+; RV32I-NEXT:    xori t3, t4, 31
+; RV32I-NEXT:    sw t3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bltz t6, .LBB10_12
+; RV32I-NEXT:  # %bb.11:
+; RV32I-NEXT:    sll t3, ra, t6
+; RV32I-NEXT:    j .LBB10_13
+; RV32I-NEXT:  .LBB10_12:
+; RV32I-NEXT:    sll t2, s6, t4
+; RV32I-NEXT:    srl t3, s0, t3
+; RV32I-NEXT:    or t3, t2, t3
+; RV32I-NEXT:  .LBB10_13:
+; RV32I-NEXT:    or a7, a7, a5
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    addi t5, a0, -32
+; RV32I-NEXT:    xori s4, a0, 31
+; RV32I-NEXT:    bltz t5, .LBB10_15
+; RV32I-NEXT:  # %bb.14:
+; RV32I-NEXT:    sll a5, a7, t5
+; RV32I-NEXT:    j .LBB10_16
+; RV32I-NEXT:  .LBB10_15:
+; RV32I-NEXT:    sll a5, t0, a0
+; RV32I-NEXT:    srli t1, a7, 1
+; RV32I-NEXT:    srl t1, t1, s4
+; RV32I-NEXT:    or a5, a5, t1
+; RV32I-NEXT:  .LBB10_16:
+; RV32I-NEXT:    srl s1, s6, s10
+; RV32I-NEXT:    li t1, 32
+; RV32I-NEXT:    sub t2, t1, a0
+; RV32I-NEXT:    sw t2, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    slti t2, t2, 0
+; RV32I-NEXT:    neg s9, t2
+; RV32I-NEXT:    sw s1, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bgeu a0, a1, .LBB10_18
+; RV32I-NEXT:  # %bb.17:
+; RV32I-NEXT:    and t2, s9, s1
+; RV32I-NEXT:    or t3, a5, t2
+; RV32I-NEXT:  .LBB10_18:
+; RV32I-NEXT:    sw t4, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv s1, t0
+; RV32I-NEXT:    beqz a0, .LBB10_20
+; RV32I-NEXT:  # %bb.19:
+; RV32I-NEXT:    mv s1, t3
+; RV32I-NEXT:  .LBB10_20:
+; RV32I-NEXT:    lw a1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srl a1, a1, s10
+; RV32I-NEXT:    li t2, 96
+; RV32I-NEXT:    sub t4, t2, a0
+; RV32I-NEXT:    slti t2, t4, 0
+; RV32I-NEXT:    neg t3, t2
+; RV32I-NEXT:    li a5, 128
+; RV32I-NEXT:    sub s7, a5, a0
+; RV32I-NEXT:    sltiu t2, s7, 64
+; RV32I-NEXT:    neg t2, t2
+; RV32I-NEXT:    bgeu a0, a5, .LBB10_22
+; RV32I-NEXT:  # %bb.21:
+; RV32I-NEXT:    and a6, t3, a1
+; RV32I-NEXT:    and a6, t2, a6
+; RV32I-NEXT:    or a6, s1, a6
+; RV32I-NEXT:  .LBB10_22:
+; RV32I-NEXT:    lw s3, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sw a1, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    beqz a0, .LBB10_24
+; RV32I-NEXT:  # %bb.23:
+; RV32I-NEXT:    mv t0, a6
+; RV32I-NEXT:  .LBB10_24:
+; RV32I-NEXT:    neg a6, s7
+; RV32I-NEXT:    sub s8, t1, s7
+; RV32I-NEXT:    sll t1, a3, a6
+; RV32I-NEXT:    sw t2, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bltz s8, .LBB10_27
+; RV32I-NEXT:  # %bb.25:
+; RV32I-NEXT:    mv a6, t1
+; RV32I-NEXT:    li a1, 64
+; RV32I-NEXT:    li a5, 64
+; RV32I-NEXT:    bgeu s7, a1, .LBB10_28
+; RV32I-NEXT:  .LBB10_26:
+; RV32I-NEXT:    lw t2, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    and t2, t3, t2
+; RV32I-NEXT:    or t2, t2, a6
+; RV32I-NEXT:    mv a6, s3
+; RV32I-NEXT:    bnez s7, .LBB10_29
+; RV32I-NEXT:    j .LBB10_30
+; RV32I-NEXT:  .LBB10_27:
+; RV32I-NEXT:    lw a1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll a6, a1, a6
+; RV32I-NEXT:    li a1, 64
+; RV32I-NEXT:    sub t2, a1, s7
+; RV32I-NEXT:    xori t2, t2, 31
+; RV32I-NEXT:    lw a5, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srl t2, a5, t2
+; RV32I-NEXT:    or a6, a6, t2
+; RV32I-NEXT:    li a5, 64
+; RV32I-NEXT:    bltu s7, a1, .LBB10_26
+; RV32I-NEXT:  .LBB10_28:
+; RV32I-NEXT:    lw a1, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    and t2, s9, a1
+; RV32I-NEXT:    mv a6, s3
+; RV32I-NEXT:    beqz s7, .LBB10_30
+; RV32I-NEXT:  .LBB10_29:
+; RV32I-NEXT:    mv a6, t2
+; RV32I-NEXT:  .LBB10_30:
+; RV32I-NEXT:    bltz t5, .LBB10_32
+; RV32I-NEXT:  # %bb.31:
+; RV32I-NEXT:    sll s0, ra, t5
+; RV32I-NEXT:    j .LBB10_33
+; RV32I-NEXT:  .LBB10_32:
+; RV32I-NEXT:    sll t2, s6, a0
+; RV32I-NEXT:    srl t3, s0, s4
+; RV32I-NEXT:    or s0, t2, t3
+; RV32I-NEXT:  .LBB10_33:
+; RV32I-NEXT:    sltiu t3, a0, 64
+; RV32I-NEXT:    sw s4, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bltz s11, .LBB10_35
+; RV32I-NEXT:  # %bb.34:
+; RV32I-NEXT:    sll a1, a4, s11
+; RV32I-NEXT:    j .LBB10_36
+; RV32I-NEXT:  .LBB10_35:
+; RV32I-NEXT:    sll t2, s3, s2
+; RV32I-NEXT:    lw s4, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a1, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srl a1, s4, a1
+; RV32I-NEXT:    or a1, t2, a1
+; RV32I-NEXT:  .LBB10_36:
+; RV32I-NEXT:    neg s5, t3
+; RV32I-NEXT:    sltiu t2, s2, 64
+; RV32I-NEXT:    neg t3, t2
+; RV32I-NEXT:    li t2, 128
+; RV32I-NEXT:    bltu a0, t2, .LBB10_38
+; RV32I-NEXT:  # %bb.37:
+; RV32I-NEXT:    and a1, t3, a1
+; RV32I-NEXT:    mv s0, s6
+; RV32I-NEXT:    bnez a0, .LBB10_39
+; RV32I-NEXT:    j .LBB10_40
+; RV32I-NEXT:  .LBB10_38:
+; RV32I-NEXT:    and a1, s5, s0
+; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    mv s0, s6
+; RV32I-NEXT:    beqz a0, .LBB10_40
+; RV32I-NEXT:  .LBB10_39:
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:  .LBB10_40:
+; RV32I-NEXT:    srl a1, a3, s10
+; RV32I-NEXT:    lw a6, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    slli a6, a6, 1
+; RV32I-NEXT:    sub t2, a5, a0
+; RV32I-NEXT:    xori t2, t2, 31
+; RV32I-NEXT:    lw s1, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sw t2, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bltz s1, .LBB10_42
+; RV32I-NEXT:  # %bb.41:
+; RV32I-NEXT:    lw s4, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    j .LBB10_43
+; RV32I-NEXT:  .LBB10_42:
+; RV32I-NEXT:    sll t2, a6, t2
+; RV32I-NEXT:    or s4, a1, t2
+; RV32I-NEXT:  .LBB10_43:
+; RV32I-NEXT:    srl s1, a4, s10
+; RV32I-NEXT:    slli s3, s3, 1
+; RV32I-NEXT:    xori s9, s7, 31
+; RV32I-NEXT:    sw s3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bltz t4, .LBB10_45
+; RV32I-NEXT:  # %bb.44:
+; RV32I-NEXT:    mv s3, s1
+; RV32I-NEXT:    lw t2, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bltu s7, a5, .LBB10_46
+; RV32I-NEXT:    j .LBB10_47
+; RV32I-NEXT:  .LBB10_45:
+; RV32I-NEXT:    sll t2, s3, s9
+; RV32I-NEXT:    mv s3, s1
+; RV32I-NEXT:    or t2, s1, t2
+; RV32I-NEXT:    bgeu s7, a5, .LBB10_47
+; RV32I-NEXT:  .LBB10_46:
+; RV32I-NEXT:    slti s4, s8, 0
+; RV32I-NEXT:    neg s4, s4
+; RV32I-NEXT:    and t1, s4, t1
+; RV32I-NEXT:    or s4, t2, t1
+; RV32I-NEXT:  .LBB10_47:
+; RV32I-NEXT:    mv s8, a4
+; RV32I-NEXT:    beqz s7, .LBB10_49
+; RV32I-NEXT:  # %bb.48:
+; RV32I-NEXT:    mv s8, s4
+; RV32I-NEXT:  .LBB10_49:
+; RV32I-NEXT:    slti t1, t5, 0
+; RV32I-NEXT:    neg s7, t1
+; RV32I-NEXT:    slti t1, s11, 0
+; RV32I-NEXT:    neg t1, t1
+; RV32I-NEXT:    li a5, 128
+; RV32I-NEXT:    bltu a0, a5, .LBB10_51
+; RV32I-NEXT:  # %bb.50:
+; RV32I-NEXT:    sll t2, a4, s2
+; RV32I-NEXT:    and t2, t1, t2
+; RV32I-NEXT:    and t2, t3, t2
+; RV32I-NEXT:    mv s11, ra
+; RV32I-NEXT:    bnez a0, .LBB10_52
+; RV32I-NEXT:    j .LBB10_53
+; RV32I-NEXT:  .LBB10_51:
+; RV32I-NEXT:    sll t2, ra, a0
+; RV32I-NEXT:    and t2, s7, t2
+; RV32I-NEXT:    and t2, s5, t2
+; RV32I-NEXT:    or t2, t2, s8
+; RV32I-NEXT:    mv s11, ra
+; RV32I-NEXT:    beqz a0, .LBB10_53
+; RV32I-NEXT:  .LBB10_52:
+; RV32I-NEXT:    mv s11, t2
+; RV32I-NEXT:  .LBB10_53:
+; RV32I-NEXT:    lw a5, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bgez a5, .LBB10_55
+; RV32I-NEXT:  # %bb.54:
+; RV32I-NEXT:    srl t2, ra, s10
+; RV32I-NEXT:    slli s6, s6, 1
+; RV32I-NEXT:    lw a5, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll t3, s6, a5
+; RV32I-NEXT:    or a5, t2, t3
+; RV32I-NEXT:    sw a5, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT:  .LBB10_55:
+; RV32I-NEXT:    slti t2, t6, 0
+; RV32I-NEXT:    neg s6, t2
+; RV32I-NEXT:    li s10, 64
+; RV32I-NEXT:    bltu a0, s10, .LBB10_57
+; RV32I-NEXT:  # %bb.56:
+; RV32I-NEXT:    lw a5, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll t2, ra, a5
+; RV32I-NEXT:    and t2, s6, t2
+; RV32I-NEXT:    j .LBB10_58
+; RV32I-NEXT:  .LBB10_57:
+; RV32I-NEXT:    sll t2, a7, a0
+; RV32I-NEXT:    and t2, s7, t2
+; RV32I-NEXT:    lw a5, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or t2, t2, a5
+; RV32I-NEXT:  .LBB10_58:
+; RV32I-NEXT:    lw s4, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    mv t3, a7
+; RV32I-NEXT:    beqz a0, .LBB10_60
+; RV32I-NEXT:  # %bb.59:
+; RV32I-NEXT:    mv t3, t2
+; RV32I-NEXT:  .LBB10_60:
+; RV32I-NEXT:    bgez t4, .LBB10_62
+; RV32I-NEXT:  # %bb.61:
+; RV32I-NEXT:    sll a5, a6, s9
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    sw a1, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT:  .LBB10_62:
+; RV32I-NEXT:    lw t2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    mv s1, s3
+; RV32I-NEXT:    lw t4, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a1, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bltz a1, .LBB10_65
+; RV32I-NEXT:  # %bb.63:
+; RV32I-NEXT:    mv a1, s8
+; RV32I-NEXT:    bgeu s2, s10, .LBB10_66
+; RV32I-NEXT:  .LBB10_64:
+; RV32I-NEXT:    sll a6, a3, s2
+; RV32I-NEXT:    and a6, t1, a6
+; RV32I-NEXT:    or a6, a6, a1
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    bnez s2, .LBB10_67
+; RV32I-NEXT:    j .LBB10_68
+; RV32I-NEXT:  .LBB10_65:
+; RV32I-NEXT:    li a1, 192
+; RV32I-NEXT:    sub a1, a1, a0
+; RV32I-NEXT:    xori a1, a1, 31
+; RV32I-NEXT:    lw a5, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll a1, a5, a1
+; RV32I-NEXT:    or a1, s1, a1
+; RV32I-NEXT:    bltu s2, s10, .LBB10_64
+; RV32I-NEXT:  .LBB10_66:
+; RV32I-NEXT:    lw a1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll a1, a4, a1
+; RV32I-NEXT:    lw a5, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    slti a6, a5, 0
+; RV32I-NEXT:    neg a6, a6
+; RV32I-NEXT:    and a6, a6, a1
+; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    beqz s2, .LBB10_68
+; RV32I-NEXT:  .LBB10_67:
+; RV32I-NEXT:    mv a1, a6
+; RV32I-NEXT:  .LBB10_68:
+; RV32I-NEXT:    li a5, 128
+; RV32I-NEXT:    bltu a0, a5, .LBB10_73
+; RV32I-NEXT:  # %bb.69:
+; RV32I-NEXT:    bnez a0, .LBB10_74
+; RV32I-NEXT:  .LBB10_70:
+; RV32I-NEXT:    bltz t6, .LBB10_75
+; RV32I-NEXT:  .LBB10_71:
+; RV32I-NEXT:    sll a1, a4, t6
+; RV32I-NEXT:    lw t3, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bgez t5, .LBB10_76
+; RV32I-NEXT:  .LBB10_72:
+; RV32I-NEXT:    sll a5, t3, a0
+; RV32I-NEXT:    lw a6, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw t1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srl a6, a6, t1
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    bltu a0, s10, .LBB10_77
+; RV32I-NEXT:    j .LBB10_78
+; RV32I-NEXT:  .LBB10_73:
+; RV32I-NEXT:    lw a1, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    and a1, a5, a1
+; RV32I-NEXT:    or a1, t3, a1
+; RV32I-NEXT:    beqz a0, .LBB10_70
+; RV32I-NEXT:  .LBB10_74:
+; RV32I-NEXT:    mv a7, a1
+; RV32I-NEXT:    bgez t6, .LBB10_71
+; RV32I-NEXT:  .LBB10_75:
+; RV32I-NEXT:    sll a1, t2, t4
+; RV32I-NEXT:    lw a5, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srl a5, s4, a5
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    lw t3, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bltz t5, .LBB10_72
+; RV32I-NEXT:  .LBB10_76:
+; RV32I-NEXT:    sll a5, a3, t5
+; RV32I-NEXT:    bgeu a0, s10, .LBB10_78
+; RV32I-NEXT:  .LBB10_77:
+; RV32I-NEXT:    lw a1, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    and a1, a1, s8
+; RV32I-NEXT:    or a1, a5, a1
+; RV32I-NEXT:  .LBB10_78:
+; RV32I-NEXT:    bnez a0, .LBB10_82
+; RV32I-NEXT:  # %bb.79:
+; RV32I-NEXT:    lw a1, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bltz a1, .LBB10_83
+; RV32I-NEXT:  .LBB10_80:
+; RV32I-NEXT:    sltiu a1, a0, 128
+; RV32I-NEXT:    bltu a0, s10, .LBB10_84
+; RV32I-NEXT:  .LBB10_81:
+; RV32I-NEXT:    sll a5, a4, t4
+; RV32I-NEXT:    and a6, s6, a5
+; RV32I-NEXT:    neg a5, a1
+; RV32I-NEXT:    bnez a0, .LBB10_85
+; RV32I-NEXT:    j .LBB10_86
+; RV32I-NEXT:  .LBB10_82:
+; RV32I-NEXT:    mv t3, a1
+; RV32I-NEXT:    lw a1, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bgez a1, .LBB10_80
+; RV32I-NEXT:  .LBB10_83:
+; RV32I-NEXT:    lw a1, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw a5, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll a1, a5, a1
+; RV32I-NEXT:    or s8, s1, a1
+; RV32I-NEXT:    sltiu a1, a0, 128
+; RV32I-NEXT:    bgeu a0, s10, .LBB10_81
+; RV32I-NEXT:  .LBB10_84:
+; RV32I-NEXT:    sll a5, a3, a0
+; RV32I-NEXT:    and a5, s7, a5
+; RV32I-NEXT:    or a6, a5, s8
+; RV32I-NEXT:    neg a5, a1
+; RV32I-NEXT:    beqz a0, .LBB10_86
+; RV32I-NEXT:  .LBB10_85:
+; RV32I-NEXT:    mv a3, a6
+; RV32I-NEXT:  .LBB10_86:
+; RV32I-NEXT:    and a6, a5, t3
+; RV32I-NEXT:    and a1, a5, a3
+; RV32I-NEXT:    bltz t5, .LBB10_88
+; RV32I-NEXT:  # %bb.87:
+; RV32I-NEXT:    sll a3, a4, t5
+; RV32I-NEXT:    j .LBB10_89
+; RV32I-NEXT:  .LBB10_88:
+; RV32I-NEXT:    sll a3, t2, a0
+; RV32I-NEXT:    lw t1, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srl t1, s4, t1
+; RV32I-NEXT:    or a3, a3, t1
+; RV32I-NEXT:  .LBB10_89:
+; RV32I-NEXT:    and a3, s5, a3
+; RV32I-NEXT:    and a3, a5, a3
+; RV32I-NEXT:    sll a0, a4, a0
+; RV32I-NEXT:    and a0, s7, a0
+; RV32I-NEXT:    and a0, s5, a0
+; RV32I-NEXT:    and a0, a5, a0
+; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    sb a3, 4(a2)
+; RV32I-NEXT:    srli a4, a0, 24
+; RV32I-NEXT:    sb a4, 3(a2)
+; RV32I-NEXT:    srli a4, a0, 16
+; RV32I-NEXT:    sb a4, 2(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 1(a2)
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 5(a2)
+; RV32I-NEXT:    sb a6, 12(a2)
+; RV32I-NEXT:    sb a1, 8(a2)
+; RV32I-NEXT:    srli a0, a6, 24
+; RV32I-NEXT:    sb a0, 15(a2)
+; RV32I-NEXT:    srli a0, a6, 16
+; RV32I-NEXT:    sb a0, 14(a2)
+; RV32I-NEXT:    srli a0, a6, 8
+; RV32I-NEXT:    sb a0, 13(a2)
+; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    srli a0, a1, 24
+; RV32I-NEXT:    sb a0, 11(a2)
+; RV32I-NEXT:    srli a0, a1, 16
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a1, 9(a2)
+; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    srli a0, t0, 24
+; RV32I-NEXT:    sb a0, 31(a2)
+; RV32I-NEXT:    srli a0, t0, 16
+; RV32I-NEXT:    sb a0, 30(a2)
+; RV32I-NEXT:    srli a0, t0, 8
+; RV32I-NEXT:    sb a0, 29(a2)
+; RV32I-NEXT:    sb s11, 16(a2)
+; RV32I-NEXT:    srli a0, a7, 24
+; RV32I-NEXT:    sb a0, 27(a2)
+; RV32I-NEXT:    srli a0, a7, 16
+; RV32I-NEXT:    sb a0, 26(a2)
+; RV32I-NEXT:    srli a0, a7, 8
+; RV32I-NEXT:    sb a0, 25(a2)
+; RV32I-NEXT:    srli a0, s11, 24
+; RV32I-NEXT:    sb a0, 19(a2)
+; RV32I-NEXT:    srli a0, s11, 16
+; RV32I-NEXT:    sb a0, 18(a2)
+; RV32I-NEXT:    srli a0, s11, 8
+; RV32I-NEXT:    sb a0, 17(a2)
+; RV32I-NEXT:    sb s0, 20(a2)
+; RV32I-NEXT:    srli a0, s0, 24
+; RV32I-NEXT:    sb a0, 23(a2)
+; RV32I-NEXT:    srli a0, s0, 16
+; RV32I-NEXT:    sb a0, 22(a2)
+; RV32I-NEXT:    srli s0, s0, 8
+; RV32I-NEXT:    sb s0, 21(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %bitOff = load i256, ptr %bitOff.ptr, align 1
+  %res = shl i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    sd s0, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 9(a0)
+; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a5, 10(a0)
+; RV64I-NEXT:    lbu a6, 11(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a7, a4, a3
+; RV64I-NEXT:    lbu a3, 13(a0)
+; RV64I-NEXT:    lbu a4, 12(a0)
+; RV64I-NEXT:    lbu a5, 14(a0)
+; RV64I-NEXT:    lbu a6, 15(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a4, a4, a3
+; RV64I-NEXT:    lbu a3, 1(a0)
+; RV64I-NEXT:    lbu a5, 0(a0)
+; RV64I-NEXT:    lbu a6, 2(a0)
+; RV64I-NEXT:    lbu t0, 3(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a5, t0, a6
+; RV64I-NEXT:    or t1, a5, a3
+; RV64I-NEXT:    lbu a3, 5(a0)
+; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu t0, 7(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a5, t0, a6
+; RV64I-NEXT:    or t0, a5, a3
+; RV64I-NEXT:    lbu a3, 25(a0)
+; RV64I-NEXT:    lbu a5, 24(a0)
+; RV64I-NEXT:    lbu a6, 26(a0)
+; RV64I-NEXT:    lbu t2, 27(a0)
+; RV64I-NEXT:    slli a3, a3, 8
+; RV64I-NEXT:    or a3, a3, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, t2, a6
+; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    lbu a5, 29(a0)
+; RV64I-NEXT:    lbu a6, 28(a0)
+; RV64I-NEXT:    lbu t2, 30(a0)
+; RV64I-NEXT:    lbu t3, 31(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or a6, t3, t2
+; RV64I-NEXT:    or a6, a6, a5
+; RV64I-NEXT:    slli a5, a6, 32
+; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    lbu a5, 17(a0)
+; RV64I-NEXT:    lbu t2, 16(a0)
+; RV64I-NEXT:    lbu t3, 18(a0)
+; RV64I-NEXT:    lbu t4, 19(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, t2
+; RV64I-NEXT:    slli t3, t3, 16
+; RV64I-NEXT:    slli t4, t4, 24
+; RV64I-NEXT:    or t2, t4, t3
+; RV64I-NEXT:    or a5, t2, a5
+; RV64I-NEXT:    lbu t2, 21(a0)
+; RV64I-NEXT:    lbu t3, 20(a0)
+; RV64I-NEXT:    lbu t4, 22(a0)
+; RV64I-NEXT:    lbu a0, 23(a0)
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or t2, t2, t3
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or a0, a0, t4
+; RV64I-NEXT:    or a0, a0, t2
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a5, a0, a5
+; RV64I-NEXT:    lbu a0, 1(a1)
+; RV64I-NEXT:    lbu t2, 0(a1)
+; RV64I-NEXT:    lbu t3, 2(a1)
+; RV64I-NEXT:    lbu t4, 3(a1)
+; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    or a0, a0, t2
+; RV64I-NEXT:    slli t3, t3, 16
+; RV64I-NEXT:    slli t4, t4, 24
+; RV64I-NEXT:    lbu t2, 5(a1)
+; RV64I-NEXT:    lbu t5, 4(a1)
+; RV64I-NEXT:    or t3, t4, t3
+; RV64I-NEXT:    or t3, t3, a0
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or t2, t2, t5
+; RV64I-NEXT:    lbu t4, 6(a1)
+; RV64I-NEXT:    lbu t5, 7(a1)
+; RV64I-NEXT:    slli a0, a4, 32
+; RV64I-NEXT:    slli a1, t0, 32
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    slli t5, t5, 24
+; RV64I-NEXT:    or a4, t5, t4
+; RV64I-NEXT:    or a4, a4, t2
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    or a4, a4, t3
+; RV64I-NEXT:    addi t3, a4, -128
+; RV64I-NEXT:    addi t4, a4, -192
+; RV64I-NEXT:    slli t0, a3, 1
+; RV64I-NEXT:    bltz t4, .LBB11_2
+; RV64I-NEXT:  # %bb.1:
+; RV64I-NEXT:    sra t6, a3, t4
+; RV64I-NEXT:    j .LBB11_3
+; RV64I-NEXT:  .LBB11_2:
+; RV64I-NEXT:    srl t2, a5, t3
+; RV64I-NEXT:    xori t5, t3, 63
+; RV64I-NEXT:    sll t5, t0, t5
+; RV64I-NEXT:    or t6, t2, t5
+; RV64I-NEXT:  .LBB11_3:
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    addi a7, a4, -64
+; RV64I-NEXT:    xori t2, a4, 63
+; RV64I-NEXT:    bltz a7, .LBB11_5
+; RV64I-NEXT:  # %bb.4:
+; RV64I-NEXT:    srl s2, a0, a7
+; RV64I-NEXT:    j .LBB11_6
+; RV64I-NEXT:  .LBB11_5:
+; RV64I-NEXT:    srl t1, a1, a4
+; RV64I-NEXT:    slli t5, a0, 1
+; RV64I-NEXT:    sll t5, t5, t2
+; RV64I-NEXT:    or s2, t1, t5
+; RV64I-NEXT:  .LBB11_6:
+; RV64I-NEXT:    negw s0, a4
+; RV64I-NEXT:    sll t5, a5, s0
+; RV64I-NEXT:    li s1, 64
+; RV64I-NEXT:    li t1, 128
+; RV64I-NEXT:    sub s1, s1, a4
+; RV64I-NEXT:    bltu a4, t1, .LBB11_11
+; RV64I-NEXT:  # %bb.7:
+; RV64I-NEXT:    bnez a4, .LBB11_12
+; RV64I-NEXT:  .LBB11_8:
+; RV64I-NEXT:    bltz s1, .LBB11_13
+; RV64I-NEXT:  .LBB11_9:
+; RV64I-NEXT:    sraiw a6, a6, 31
+; RV64I-NEXT:    bltz t4, .LBB11_14
+; RV64I-NEXT:  .LBB11_10:
+; RV64I-NEXT:    mv t3, a6
+; RV64I-NEXT:    bltu a4, t1, .LBB11_15
+; RV64I-NEXT:    j .LBB11_16
+; RV64I-NEXT:  .LBB11_11:
+; RV64I-NEXT:    slti t6, s1, 0
+; RV64I-NEXT:    neg t6, t6
+; RV64I-NEXT:    and t6, t6, t5
+; RV64I-NEXT:    or t6, s2, t6
+; RV64I-NEXT:    beqz a4, .LBB11_8
+; RV64I-NEXT:  .LBB11_12:
+; RV64I-NEXT:    mv a1, t6
+; RV64I-NEXT:    bgez s1, .LBB11_9
+; RV64I-NEXT:  .LBB11_13:
+; RV64I-NEXT:    sll t5, a3, s0
+; RV64I-NEXT:    srli t6, a5, 1
+; RV64I-NEXT:    sub s0, t1, a4
+; RV64I-NEXT:    xori s0, s0, 63
+; RV64I-NEXT:    srl t6, t6, s0
+; RV64I-NEXT:    or t5, t5, t6
+; RV64I-NEXT:    sraiw a6, a6, 31
+; RV64I-NEXT:    bgez t4, .LBB11_10
+; RV64I-NEXT:  .LBB11_14:
+; RV64I-NEXT:    sra t3, a3, t3
+; RV64I-NEXT:    bgeu a4, t1, .LBB11_16
+; RV64I-NEXT:  .LBB11_15:
+; RV64I-NEXT:    slti t3, a7, 0
+; RV64I-NEXT:    srl t4, a0, a4
+; RV64I-NEXT:    neg t3, t3
+; RV64I-NEXT:    and t3, t3, t4
+; RV64I-NEXT:    or t3, t3, t5
+; RV64I-NEXT:  .LBB11_16:
+; RV64I-NEXT:    bnez a4, .LBB11_19
+; RV64I-NEXT:  # %bb.17:
+; RV64I-NEXT:    bltz a7, .LBB11_20
+; RV64I-NEXT:  .LBB11_18:
+; RV64I-NEXT:    sra a5, a3, a7
+; RV64I-NEXT:    bgeu a4, t1, .LBB11_21
+; RV64I-NEXT:    j .LBB11_22
+; RV64I-NEXT:  .LBB11_19:
+; RV64I-NEXT:    mv a0, t3
+; RV64I-NEXT:    bgez a7, .LBB11_18
+; RV64I-NEXT:  .LBB11_20:
+; RV64I-NEXT:    srl a5, a5, a4
+; RV64I-NEXT:    sll t0, t0, t2
+; RV64I-NEXT:    or a5, a5, t0
+; RV64I-NEXT:    bltu a4, t1, .LBB11_22
+; RV64I-NEXT:  .LBB11_21:
+; RV64I-NEXT:    mv a5, a6
+; RV64I-NEXT:  .LBB11_22:
+; RV64I-NEXT:    bltz a7, .LBB11_24
+; RV64I-NEXT:  # %bb.23:
+; RV64I-NEXT:    mv a3, a6
+; RV64I-NEXT:    bgeu a4, t1, .LBB11_25
+; RV64I-NEXT:    j .LBB11_26
+; RV64I-NEXT:  .LBB11_24:
+; RV64I-NEXT:    sra a3, a3, a4
+; RV64I-NEXT:    bltu a4, t1, .LBB11_26
+; RV64I-NEXT:  .LBB11_25:
+; RV64I-NEXT:    mv a3, a6
+; RV64I-NEXT:  .LBB11_26:
+; RV64I-NEXT:    sb a3, 24(a2)
+; RV64I-NEXT:    srli a4, a3, 56
+; RV64I-NEXT:    sb a4, 31(a2)
+; RV64I-NEXT:    srli a4, a3, 48
+; RV64I-NEXT:    sb a4, 30(a2)
+; RV64I-NEXT:    srli a4, a3, 40
+; RV64I-NEXT:    sb a4, 29(a2)
+; RV64I-NEXT:    srli a4, a3, 32
+; RV64I-NEXT:    sb a4, 28(a2)
+; RV64I-NEXT:    srli a4, a3, 24
+; RV64I-NEXT:    sb a4, 27(a2)
+; RV64I-NEXT:    srli a4, a3, 16
+; RV64I-NEXT:    sb a4, 26(a2)
+; RV64I-NEXT:    srli a3, a3, 8
+; RV64I-NEXT:    sb a3, 25(a2)
+; RV64I-NEXT:    sb a5, 16(a2)
+; RV64I-NEXT:    srli a3, a5, 56
+; RV64I-NEXT:    sb a3, 23(a2)
+; RV64I-NEXT:    srli a3, a5, 48
+; RV64I-NEXT:    sb a3, 22(a2)
+; RV64I-NEXT:    srli a3, a5, 40
+; RV64I-NEXT:    sb a3, 21(a2)
+; RV64I-NEXT:    srli a3, a5, 32
+; RV64I-NEXT:    sb a3, 20(a2)
+; RV64I-NEXT:    srli a3, a5, 24
+; RV64I-NEXT:    sb a3, 19(a2)
+; RV64I-NEXT:    srli a3, a5, 16
+; RV64I-NEXT:    sb a3, 18(a2)
+; RV64I-NEXT:    srli a5, a5, 8
+; RV64I-NEXT:    sb a5, 17(a2)
+; RV64I-NEXT:    sb a1, 0(a2)
+; RV64I-NEXT:    srli a3, a1, 56
+; RV64I-NEXT:    sb a3, 7(a2)
+; RV64I-NEXT:    srli a3, a1, 48
+; RV64I-NEXT:    sb a3, 6(a2)
+; RV64I-NEXT:    srli a3, a1, 40
+; RV64I-NEXT:    sb a3, 5(a2)
+; RV64I-NEXT:    srli a3, a1, 32
+; RV64I-NEXT:    sb a3, 4(a2)
+; RV64I-NEXT:    srli a3, a1, 24
+; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    srli a3, a1, 16
+; RV64I-NEXT:    sb a3, 2(a2)
+; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 1(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
+; RV64I-NEXT:    srli a1, a0, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a0, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a0, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a0, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    srli a0, a0, 8
+; RV64I-NEXT:    sb a0, 9(a2)
+; RV64I-NEXT:    ld s0, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: ashr_32bytes:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu a5, 5(a0)
+; RV32I-NEXT:    lbu t2, 6(a0)
+; RV32I-NEXT:    lbu t3, 7(a0)
+; RV32I-NEXT:    lbu t0, 0(a0)
+; RV32I-NEXT:    lbu t4, 1(a0)
+; RV32I-NEXT:    lbu s9, 2(a0)
+; RV32I-NEXT:    lbu s0, 3(a0)
+; RV32I-NEXT:    lbu t1, 12(a0)
+; RV32I-NEXT:    lbu t6, 13(a0)
+; RV32I-NEXT:    lbu s3, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s1, 8(a0)
+; RV32I-NEXT:    lbu s2, 9(a0)
+; RV32I-NEXT:    lbu s6, 10(a0)
+; RV32I-NEXT:    lbu s7, 11(a0)
+; RV32I-NEXT:    lbu a3, 21(a0)
+; RV32I-NEXT:    lbu a4, 20(a0)
+; RV32I-NEXT:    lbu a6, 22(a0)
+; RV32I-NEXT:    lbu t5, 23(a0)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    or a4, t5, a6
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 17(a0)
+; RV32I-NEXT:    lbu a6, 16(a0)
+; RV32I-NEXT:    lbu t5, 18(a0)
+; RV32I-NEXT:    lbu s4, 19(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or s8, a4, a6
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli s4, s4, 24
+; RV32I-NEXT:    or a6, s4, t5
+; RV32I-NEXT:    lbu a4, 29(a0)
+; RV32I-NEXT:    lbu t5, 28(a0)
+; RV32I-NEXT:    lbu s4, 30(a0)
+; RV32I-NEXT:    lbu s10, 31(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, t5
+; RV32I-NEXT:    slli t5, s4, 16
+; RV32I-NEXT:    slli s4, s10, 24
+; RV32I-NEXT:    or t5, s4, t5
+; RV32I-NEXT:    or a4, t5, a4
+; RV32I-NEXT:    lbu t5, 25(a0)
+; RV32I-NEXT:    lbu s10, 24(a0)
+; RV32I-NEXT:    lbu s11, 26(a0)
+; RV32I-NEXT:    lbu a0, 27(a0)
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    or t5, t5, s10
+; RV32I-NEXT:    slli s11, s11, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a0, a0, s11
+; RV32I-NEXT:    or s11, a0, t5
+; RV32I-NEXT:    lbu a0, 1(a1)
+; RV32I-NEXT:    lbu t5, 0(a1)
+; RV32I-NEXT:    lbu s10, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a0, a0, t5
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, s10
+; RV32I-NEXT:    or a1, a1, a0
+; RV32I-NEXT:    addi t5, a1, -192
+; RV32I-NEXT:    addi a0, a1, -224
+; RV32I-NEXT:    slli s10, a4, 1
+; RV32I-NEXT:    sw s11, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 64(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw t5, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw a0, 40(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bltz a0, .LBB11_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sra a0, a4, a0
+; RV32I-NEXT:    j .LBB11_3
+; RV32I-NEXT:  .LBB11_2:
+; RV32I-NEXT:    srl a0, s11, t5
+; RV32I-NEXT:    xori t5, t5, 31
+; RV32I-NEXT:    sll t5, s10, t5
+; RV32I-NEXT:    or a0, a0, t5
+; RV32I-NEXT:  .LBB11_3:
+; RV32I-NEXT:    slli s10, t6, 8
+; RV32I-NEXT:    slli s11, s3, 16
+; RV32I-NEXT:    slli ra, s5, 24
+; RV32I-NEXT:    or t5, a6, s8
+; RV32I-NEXT:    addi s3, a1, -128
+; RV32I-NEXT:    slli t6, a3, 1
+; RV32I-NEXT:    addi s5, a1, -160
+; RV32I-NEXT:    xori s8, s3, 31
+; RV32I-NEXT:    sw t6, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bltz s5, .LBB11_5
+; RV32I-NEXT:  # %bb.4:
+; RV32I-NEXT:    srl t6, a3, s5
+; RV32I-NEXT:    j .LBB11_6
+; RV32I-NEXT:  .LBB11_5:
+; RV32I-NEXT:    srl a6, t5, s3
+; RV32I-NEXT:    sll t6, t6, s8
+; RV32I-NEXT:    or t6, a6, t6
+; RV32I-NEXT:  .LBB11_6:
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    or a6, s10, t1
+; RV32I-NEXT:    or s8, ra, s11
+; RV32I-NEXT:    neg ra, a1
+; RV32I-NEXT:    lw t1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll s11, t1, ra
+; RV32I-NEXT:    li s10, 160
+; RV32I-NEXT:    li t1, 64
+; RV32I-NEXT:    sub s10, s10, a1
+; RV32I-NEXT:    sw s11, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bgeu s3, t1, .LBB11_8
+; RV32I-NEXT:  # %bb.7:
+; RV32I-NEXT:    slti a0, s10, 0
+; RV32I-NEXT:    neg a0, a0
+; RV32I-NEXT:    and a0, a0, s11
+; RV32I-NEXT:    or a0, t6, a0
+; RV32I-NEXT:  .LBB11_8:
+; RV32I-NEXT:    slli t6, a5, 8
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli s9, s9, 16
+; RV32I-NEXT:    slli s0, s0, 24
+; RV32I-NEXT:    or s1, s2, s1
+; RV32I-NEXT:    or s2, s7, s6
+; RV32I-NEXT:    or a5, s8, a6
+; RV32I-NEXT:    mv s7, t5
+; RV32I-NEXT:    beqz s3, .LBB11_10
+; RV32I-NEXT:  # %bb.9:
+; RV32I-NEXT:    mv s7, a0
+; RV32I-NEXT:  .LBB11_10:
+; RV32I-NEXT:    or a0, t6, a7
+; RV32I-NEXT:    or a7, t3, t2
+; RV32I-NEXT:    or t0, t4, t0
+; RV32I-NEXT:    or t2, s0, s9
+; RV32I-NEXT:    or s1, s2, s1
+; RV32I-NEXT:    addi t6, a1, -64
+; RV32I-NEXT:    slli s8, a5, 1
+; RV32I-NEXT:    addi s0, a1, -96
+; RV32I-NEXT:    xori t3, t6, 31
+; RV32I-NEXT:    sw t3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bltz s0, .LBB11_12
+; RV32I-NEXT:  # %bb.11:
+; RV32I-NEXT:    srl a6, a5, s0
+; RV32I-NEXT:    j .LBB11_13
+; RV32I-NEXT:  .LBB11_12:
+; RV32I-NEXT:    srl a6, s1, t6
+; RV32I-NEXT:    sll t3, s8, t3
+; RV32I-NEXT:    or a6, a6, t3
+; RV32I-NEXT:  .LBB11_13:
+; RV32I-NEXT:    or s11, a7, a0
+; RV32I-NEXT:    or t2, t2, t0
+; RV32I-NEXT:    addi t4, a1, -32
+; RV32I-NEXT:    xori s9, a1, 31
+; RV32I-NEXT:    bltz t4, .LBB11_15
+; RV32I-NEXT:  # %bb.14:
+; RV32I-NEXT:    srl a7, s11, t4
+; RV32I-NEXT:    j .LBB11_16
+; RV32I-NEXT:  .LBB11_15:
+; RV32I-NEXT:    srl a0, t2, a1
+; RV32I-NEXT:    slli a7, s11, 1
+; RV32I-NEXT:    sll a7, a7, s9
+; RV32I-NEXT:    or a7, a0, a7
+; RV32I-NEXT:  .LBB11_16:
+; RV32I-NEXT:    sll t3, s1, ra
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    sub s6, a0, a1
+; RV32I-NEXT:    slti t0, s6, 0
+; RV32I-NEXT:    neg t0, t0
+; RV32I-NEXT:    bgeu a1, t1, .LBB11_18
+; RV32I-NEXT:  # %bb.17:
+; RV32I-NEXT:    and a6, t0, t3
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:  .LBB11_18:
+; RV32I-NEXT:    sw s10, 36(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw t0, 44(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw t6, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv t0, t2
+; RV32I-NEXT:    beqz a1, .LBB11_20
+; RV32I-NEXT:  # %bb.19:
+; RV32I-NEXT:    mv t0, a6
+; RV32I-NEXT:  .LBB11_20:
+; RV32I-NEXT:    sll a6, t5, ra
+; RV32I-NEXT:    li a7, 96
+; RV32I-NEXT:    sub s10, a7, a1
+; RV32I-NEXT:    slti a7, s10, 0
+; RV32I-NEXT:    neg a7, a7
+; RV32I-NEXT:    li s0, 128
+; RV32I-NEXT:    sub s2, s0, a1
+; RV32I-NEXT:    sltiu t6, s2, 64
+; RV32I-NEXT:    neg t6, t6
+; RV32I-NEXT:    sw t6, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bgeu a1, s0, .LBB11_22
+; RV32I-NEXT:  # %bb.21:
+; RV32I-NEXT:    mv s0, t6
+; RV32I-NEXT:    and t6, a7, a6
+; RV32I-NEXT:    and t6, s0, t6
+; RV32I-NEXT:    or s7, t0, t6
+; RV32I-NEXT:  .LBB11_22:
+; RV32I-NEXT:    beqz a1, .LBB11_24
+; RV32I-NEXT:  # %bb.23:
+; RV32I-NEXT:    mv t2, s7
+; RV32I-NEXT:  .LBB11_24:
+; RV32I-NEXT:    neg t0, s2
+; RV32I-NEXT:    sub t6, a0, s2
+; RV32I-NEXT:    srl a0, a3, t0
+; RV32I-NEXT:    sw t6, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bgez t6, .LBB11_26
+; RV32I-NEXT:  # %bb.25:
+; RV32I-NEXT:    srl a0, t5, t0
+; RV32I-NEXT:    sub t0, t1, s2
+; RV32I-NEXT:    xori t0, t0, 31
+; RV32I-NEXT:    lw t6, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll t0, t6, t0
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:  .LBB11_26:
+; RV32I-NEXT:    lw s7, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bltu s2, t1, .LBB11_28
+; RV32I-NEXT:  # %bb.27:
+; RV32I-NEXT:    lw a0, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    and a7, a0, a6
+; RV32I-NEXT:    mv a0, s7
+; RV32I-NEXT:    bnez s2, .LBB11_29
+; RV32I-NEXT:    j .LBB11_30
+; RV32I-NEXT:  .LBB11_28:
+; RV32I-NEXT:    lw t0, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    and a7, a7, t0
+; RV32I-NEXT:    or a7, a7, a0
+; RV32I-NEXT:    mv a0, s7
+; RV32I-NEXT:    beqz s2, .LBB11_30
+; RV32I-NEXT:  .LBB11_29:
+; RV32I-NEXT:    mv a0, a7
+; RV32I-NEXT:  .LBB11_30:
+; RV32I-NEXT:    bltz t4, .LBB11_32
+; RV32I-NEXT:  # %bb.31:
+; RV32I-NEXT:    srl a7, a5, t4
+; RV32I-NEXT:    j .LBB11_33
+; RV32I-NEXT:  .LBB11_32:
+; RV32I-NEXT:    srl a7, s1, a1
+; RV32I-NEXT:    sll t0, s8, s9
+; RV32I-NEXT:    or a7, a7, t0
+; RV32I-NEXT:  .LBB11_33:
+; RV32I-NEXT:    li s8, 128
+; RV32I-NEXT:    sw s9, 48(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bltz s5, .LBB11_35
+; RV32I-NEXT:  # %bb.34:
+; RV32I-NEXT:    sra t0, a4, s5
+; RV32I-NEXT:    j .LBB11_36
+; RV32I-NEXT:  .LBB11_35:
+; RV32I-NEXT:    srl t0, s7, s3
+; RV32I-NEXT:    lw t6, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll t6, t6, s9
+; RV32I-NEXT:    or t0, t0, t6
+; RV32I-NEXT:  .LBB11_36:
+; RV32I-NEXT:    sltiu t6, a1, 64
+; RV32I-NEXT:    srai s9, s4, 31
+; RV32I-NEXT:    bgeu s3, t1, .LBB11_44
+; RV32I-NEXT:  # %bb.37:
+; RV32I-NEXT:    neg s0, t6
+; RV32I-NEXT:    bltu a1, s8, .LBB11_45
+; RV32I-NEXT:  .LBB11_38:
+; RV32I-NEXT:    mv s4, s1
+; RV32I-NEXT:    beqz a1, .LBB11_40
+; RV32I-NEXT:  .LBB11_39:
+; RV32I-NEXT:    mv s4, t0
+; RV32I-NEXT:  .LBB11_40:
+; RV32I-NEXT:    sub a0, t1, a1
+; RV32I-NEXT:    xori t0, a0, 31
+; RV32I-NEXT:    bgez s6, .LBB11_42
+; RV32I-NEXT:  # %bb.41:
+; RV32I-NEXT:    sll a0, a5, ra
+; RV32I-NEXT:    srli s1, s1, 1
+; RV32I-NEXT:    srl a7, s1, t0
+; RV32I-NEXT:    or t3, a0, a7
+; RV32I-NEXT:  .LBB11_42:
+; RV32I-NEXT:    slti a0, t4, 0
+; RV32I-NEXT:    neg a7, a0
+; RV32I-NEXT:    sw a7, 32(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    bltu a1, t1, .LBB11_46
+; RV32I-NEXT:  # %bb.43:
+; RV32I-NEXT:    lw a0, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srl a0, a5, a0
+; RV32I-NEXT:    lw a7, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    slti a7, a7, 0
+; RV32I-NEXT:    neg a7, a7
+; RV32I-NEXT:    and a0, a7, a0
+; RV32I-NEXT:    j .LBB11_47
+; RV32I-NEXT:  .LBB11_44:
+; RV32I-NEXT:    mv t0, s9
+; RV32I-NEXT:    neg s0, t6
+; RV32I-NEXT:    bgeu a1, s8, .LBB11_38
+; RV32I-NEXT:  .LBB11_45:
+; RV32I-NEXT:    and a7, s0, a7
+; RV32I-NEXT:    or t0, a7, a0
+; RV32I-NEXT:    mv s4, s1
+; RV32I-NEXT:    bnez a1, .LBB11_39
+; RV32I-NEXT:    j .LBB11_40
+; RV32I-NEXT:  .LBB11_46:
+; RV32I-NEXT:    srl a0, s11, a1
+; RV32I-NEXT:    and a0, a7, a0
+; RV32I-NEXT:    or a0, a0, t3
+; RV32I-NEXT:  .LBB11_47:
+; RV32I-NEXT:    sw t0, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    mv t0, s11
+; RV32I-NEXT:    beqz a1, .LBB11_49
+; RV32I-NEXT:  # %bb.48:
+; RV32I-NEXT:    mv t0, a0
+; RV32I-NEXT:  .LBB11_49:
+; RV32I-NEXT:    sll t6, a3, ra
+; RV32I-NEXT:    srli a0, t5, 1
+; RV32I-NEXT:    xori t3, s2, 31
+; RV32I-NEXT:    bltz s10, .LBB11_51
+; RV32I-NEXT:  # %bb.50:
+; RV32I-NEXT:    mv a7, a6
+; RV32I-NEXT:    j .LBB11_52
+; RV32I-NEXT:  .LBB11_51:
+; RV32I-NEXT:    srl a7, a0, t3
+; RV32I-NEXT:    or a7, t6, a7
+; RV32I-NEXT:  .LBB11_52:
+; RV32I-NEXT:    sll ra, a4, ra
+; RV32I-NEXT:    srli s1, s7, 1
+; RV32I-NEXT:    lw s7, 36(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bltz s7, .LBB11_55
+; RV32I-NEXT:  # %bb.53:
+; RV32I-NEXT:    lw s7, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bgez s8, .LBB11_56
+; RV32I-NEXT:  .LBB11_54:
+; RV32I-NEXT:    lw s8, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sra s8, a4, s8
+; RV32I-NEXT:    bltu s3, t1, .LBB11_57
+; RV32I-NEXT:    j .LBB11_58
+; RV32I-NEXT:  .LBB11_55:
+; RV32I-NEXT:    li s7, 192
+; RV32I-NEXT:    sub s7, s7, a1
+; RV32I-NEXT:    xori s7, s7, 31
+; RV32I-NEXT:    srl s7, s1, s7
+; RV32I-NEXT:    or s7, ra, s7
+; RV32I-NEXT:    lw s8, 40(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bltz s8, .LBB11_54
+; RV32I-NEXT:  .LBB11_56:
+; RV32I-NEXT:    mv s8, s9
+; RV32I-NEXT:    bgeu s3, t1, .LBB11_58
+; RV32I-NEXT:  .LBB11_57:
+; RV32I-NEXT:    slti s8, s5, 0
+; RV32I-NEXT:    mv t1, t2
+; RV32I-NEXT:    mv t2, s6
+; RV32I-NEXT:    mv s6, s1
+; RV32I-NEXT:    mv s1, ra
+; RV32I-NEXT:    srl ra, a3, s3
+; RV32I-NEXT:    neg s8, s8
+; RV32I-NEXT:    and s8, s8, ra
+; RV32I-NEXT:    mv ra, s1
+; RV32I-NEXT:    mv s1, s6
+; RV32I-NEXT:    mv s6, t2
+; RV32I-NEXT:    mv t2, t1
+; RV32I-NEXT:    li t1, 64
+; RV32I-NEXT:    or s8, s8, s7
+; RV32I-NEXT:  .LBB11_58:
+; RV32I-NEXT:    mv s7, a3
+; RV32I-NEXT:    bnez s3, .LBB11_65
+; RV32I-NEXT:  # %bb.59:
+; RV32I-NEXT:    li s8, 128
+; RV32I-NEXT:    bltu a1, s8, .LBB11_66
+; RV32I-NEXT:  .LBB11_60:
+; RV32I-NEXT:    lw a7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bnez a1, .LBB11_67
+; RV32I-NEXT:  .LBB11_61:
+; RV32I-NEXT:    bgez s6, .LBB11_63
+; RV32I-NEXT:  .LBB11_62:
+; RV32I-NEXT:    lw a6, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srl a0, a0, a6
+; RV32I-NEXT:    or a6, t6, a0
+; RV32I-NEXT:  .LBB11_63:
+; RV32I-NEXT:    lw t0, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw t6, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bltz s10, .LBB11_68
+; RV32I-NEXT:  # %bb.64:
+; RV32I-NEXT:    mv a0, t6
+; RV32I-NEXT:    lw t3, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bltu s2, t1, .LBB11_69
+; RV32I-NEXT:    j .LBB11_70
+; RV32I-NEXT:  .LBB11_65:
+; RV32I-NEXT:    mv s7, s8
+; RV32I-NEXT:    li s8, 128
+; RV32I-NEXT:    bgeu a1, s8, .LBB11_60
+; RV32I-NEXT:  .LBB11_66:
+; RV32I-NEXT:    lw s7, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    and a7, s7, a7
+; RV32I-NEXT:    or s7, t0, a7
+; RV32I-NEXT:    lw a7, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    beqz a1, .LBB11_61
+; RV32I-NEXT:  .LBB11_67:
+; RV32I-NEXT:    mv s11, s7
+; RV32I-NEXT:    bltz s6, .LBB11_62
+; RV32I-NEXT:    j .LBB11_63
+; RV32I-NEXT:  .LBB11_68:
+; RV32I-NEXT:    srl a0, s1, t3
+; RV32I-NEXT:    or a0, ra, a0
+; RV32I-NEXT:    lw t3, 64(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bgeu s2, t1, .LBB11_70
+; RV32I-NEXT:  .LBB11_69:
+; RV32I-NEXT:    lw a6, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    slti a6, a6, 0
+; RV32I-NEXT:    neg a6, a6
+; RV32I-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    and a6, a6, s7
+; RV32I-NEXT:    or a6, a0, a6
+; RV32I-NEXT:  .LBB11_70:
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    bnez s2, .LBB11_73
+; RV32I-NEXT:  # %bb.71:
+; RV32I-NEXT:    bltz s5, .LBB11_74
+; RV32I-NEXT:  .LBB11_72:
+; RV32I-NEXT:    mv a6, s9
+; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bgeu s3, t1, .LBB11_75
+; RV32I-NEXT:    j .LBB11_76
+; RV32I-NEXT:  .LBB11_73:
+; RV32I-NEXT:    mv a0, a6
+; RV32I-NEXT:    bgez s5, .LBB11_72
+; RV32I-NEXT:  .LBB11_74:
+; RV32I-NEXT:    sra a6, a4, s3
+; RV32I-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    bltu s3, t1, .LBB11_76
+; RV32I-NEXT:  .LBB11_75:
+; RV32I-NEXT:    mv a6, s9
+; RV32I-NEXT:  .LBB11_76:
+; RV32I-NEXT:    bltu a1, s8, .LBB11_81
+; RV32I-NEXT:  # %bb.77:
+; RV32I-NEXT:    bnez a1, .LBB11_82
+; RV32I-NEXT:  .LBB11_78:
+; RV32I-NEXT:    bltz s2, .LBB11_83
+; RV32I-NEXT:  .LBB11_79:
+; RV32I-NEXT:    sra a0, a4, s2
+; RV32I-NEXT:    bgez t4, .LBB11_84
+; RV32I-NEXT:  .LBB11_80:
+; RV32I-NEXT:    srl a6, t5, a1
+; RV32I-NEXT:    lw s0, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll a7, a7, s0
+; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    bltu a1, t1, .LBB11_85
+; RV32I-NEXT:    j .LBB11_86
+; RV32I-NEXT:  .LBB11_81:
+; RV32I-NEXT:    srl a6, a5, a1
+; RV32I-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    and a6, s3, a6
+; RV32I-NEXT:    and a6, s0, a6
+; RV32I-NEXT:    or a6, a6, a0
+; RV32I-NEXT:    beqz a1, .LBB11_78
+; RV32I-NEXT:  .LBB11_82:
+; RV32I-NEXT:    mv a5, a6
+; RV32I-NEXT:    bgez s2, .LBB11_79
+; RV32I-NEXT:  .LBB11_83:
+; RV32I-NEXT:    lw a0, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srl a0, t0, a0
+; RV32I-NEXT:    lw a6, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll a6, t3, a6
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    bltz t4, .LBB11_80
+; RV32I-NEXT:  .LBB11_84:
+; RV32I-NEXT:    srl a6, a3, t4
+; RV32I-NEXT:    bgeu a1, t1, .LBB11_86
+; RV32I-NEXT:  .LBB11_85:
+; RV32I-NEXT:    lw a0, 44(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    and a0, a0, t6
+; RV32I-NEXT:    or a0, a6, a0
+; RV32I-NEXT:  .LBB11_86:
+; RV32I-NEXT:    bnez a1, .LBB11_91
+; RV32I-NEXT:  # %bb.87:
+; RV32I-NEXT:    bgeu a1, s8, .LBB11_92
+; RV32I-NEXT:  .LBB11_88:
+; RV32I-NEXT:    bltz s6, .LBB11_93
+; RV32I-NEXT:  .LBB11_89:
+; RV32I-NEXT:    bltz s2, .LBB11_94
+; RV32I-NEXT:  .LBB11_90:
+; RV32I-NEXT:    mv a0, s9
+; RV32I-NEXT:    bltu a1, t1, .LBB11_95
+; RV32I-NEXT:    j .LBB11_96
+; RV32I-NEXT:  .LBB11_91:
+; RV32I-NEXT:    mv t5, a0
+; RV32I-NEXT:    bltu a1, s8, .LBB11_88
+; RV32I-NEXT:  .LBB11_92:
+; RV32I-NEXT:    mv t5, s9
+; RV32I-NEXT:    bgez s6, .LBB11_89
+; RV32I-NEXT:  .LBB11_93:
+; RV32I-NEXT:    lw a0, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    srl a0, s1, a0
+; RV32I-NEXT:    or t6, ra, a0
+; RV32I-NEXT:    bgez s2, .LBB11_90
+; RV32I-NEXT:  .LBB11_94:
+; RV32I-NEXT:    lw a0, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sra a0, a4, a0
+; RV32I-NEXT:    bgeu a1, t1, .LBB11_96
+; RV32I-NEXT:  .LBB11_95:
+; RV32I-NEXT:    srl a0, a3, a1
+; RV32I-NEXT:    lw a6, 32(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    and a0, a6, a0
+; RV32I-NEXT:    or a0, a0, t6
+; RV32I-NEXT:  .LBB11_96:
+; RV32I-NEXT:    bnez a1, .LBB11_100
+; RV32I-NEXT:  # %bb.97:
+; RV32I-NEXT:    bgeu a1, s8, .LBB11_101
+; RV32I-NEXT:  .LBB11_98:
+; RV32I-NEXT:    bltz t4, .LBB11_102
+; RV32I-NEXT:  .LBB11_99:
+; RV32I-NEXT:    sra a0, a4, t4
+; RV32I-NEXT:    bgeu a1, t1, .LBB11_103
+; RV32I-NEXT:    j .LBB11_104
+; RV32I-NEXT:  .LBB11_100:
+; RV32I-NEXT:    mv a3, a0
+; RV32I-NEXT:    bltu a1, s8, .LBB11_98
+; RV32I-NEXT:  .LBB11_101:
+; RV32I-NEXT:    mv a3, s9
+; RV32I-NEXT:    bgez t4, .LBB11_99
+; RV32I-NEXT:  .LBB11_102:
+; RV32I-NEXT:    srl a0, t0, a1
+; RV32I-NEXT:    lw a6, 48(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    sll a6, t3, a6
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    bltu a1, t1, .LBB11_104
+; RV32I-NEXT:  .LBB11_103:
+; RV32I-NEXT:    mv a0, s9
+; RV32I-NEXT:  .LBB11_104:
+; RV32I-NEXT:    bgeu a1, s8, .LBB11_107
+; RV32I-NEXT:  # %bb.105:
+; RV32I-NEXT:    bltz t4, .LBB11_108
+; RV32I-NEXT:  .LBB11_106:
+; RV32I-NEXT:    mv a4, s9
+; RV32I-NEXT:    bgeu a1, t1, .LBB11_109
+; RV32I-NEXT:    j .LBB11_110
+; RV32I-NEXT:  .LBB11_107:
+; RV32I-NEXT:    mv a0, s9
+; RV32I-NEXT:    bgez t4, .LBB11_106
+; RV32I-NEXT:  .LBB11_108:
+; RV32I-NEXT:    sra a4, a4, a1
+; RV32I-NEXT:    bltu a1, t1, .LBB11_110
+; RV32I-NEXT:  .LBB11_109:
+; RV32I-NEXT:    mv a4, s9
+; RV32I-NEXT:  .LBB11_110:
+; RV32I-NEXT:    bltu a1, s8, .LBB11_112
+; RV32I-NEXT:  # %bb.111:
+; RV32I-NEXT:    mv a4, s9
+; RV32I-NEXT:  .LBB11_112:
+; RV32I-NEXT:    sb a4, 28(a2)
+; RV32I-NEXT:    srli a1, a4, 24
+; RV32I-NEXT:    sb a1, 31(a2)
+; RV32I-NEXT:    srli a1, a4, 16
+; RV32I-NEXT:    sb a1, 30(a2)
+; RV32I-NEXT:    srli a4, a4, 8
+; RV32I-NEXT:    sb a4, 29(a2)
+; RV32I-NEXT:    sb a0, 24(a2)
+; RV32I-NEXT:    srli a1, a0, 24
+; RV32I-NEXT:    sb a1, 27(a2)
+; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    sb a1, 26(a2)
+; RV32I-NEXT:    srli a0, a0, 8
+; RV32I-NEXT:    sb a0, 25(a2)
+; RV32I-NEXT:    sb t5, 16(a2)
+; RV32I-NEXT:    srli a0, t5, 24
+; RV32I-NEXT:    sb a0, 19(a2)
+; RV32I-NEXT:    srli a0, t5, 16
+; RV32I-NEXT:    sb a0, 18(a2)
+; RV32I-NEXT:    srli a0, t5, 8
+; RV32I-NEXT:    sb a0, 17(a2)
+; RV32I-NEXT:    sb a3, 20(a2)
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a0, 23(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    sb a0, 22(a2)
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    sb a3, 21(a2)
+; RV32I-NEXT:    sb t2, 0(a2)
+; RV32I-NEXT:    sb a5, 12(a2)
+; RV32I-NEXT:    srli a0, t2, 24
+; RV32I-NEXT:    sb a0, 3(a2)
+; RV32I-NEXT:    srli a0, t2, 16
+; RV32I-NEXT:    sb a0, 2(a2)
+; RV32I-NEXT:    srli a0, t2, 8
+; RV32I-NEXT:    sb a0, 1(a2)
+; RV32I-NEXT:    sb s11, 4(a2)
+; RV32I-NEXT:    sb s4, 8(a2)
+; RV32I-NEXT:    srli a0, a5, 24
+; RV32I-NEXT:    sb a0, 15(a2)
+; RV32I-NEXT:    srli a0, a5, 16
+; RV32I-NEXT:    sb a0, 14(a2)
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    sb a5, 13(a2)
+; RV32I-NEXT:    srli a0, s11, 24
+; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    srli a0, s11, 16
+; RV32I-NEXT:    sb a0, 6(a2)
+; RV32I-NEXT:    srli a0, s11, 8
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    srli a0, s4, 24
+; RV32I-NEXT:    sb a0, 11(a2)
+; RV32I-NEXT:    srli a0, s4, 16
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    srli a0, s4, 8
+; RV32I-NEXT:    sb a0, 9(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
+; RV32I-NEXT:    ret
+  %src = load i256, ptr %src.ptr, align 1
+  %bitOff = load i256, ptr %bitOff.ptr, align 1
+  %res = ashr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ALL: {{.*}}

diff  --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
new file mode 100644
index 000000000000..c06cd5b4477f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -0,0 +1,7825 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-NO-BMI2,X64-NO-SHLD,X64-NO-BMI2-NO-SHLD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-NO-BMI2,X64-SHLD,X64-NO-BMI2-HAVE-SHLD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-BMI2,X64-NO-SHLD,X64-HAVE-BMI2-NO-SHLD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-BMI2,X64-SHLD,X64-HAVE-BMI2-HAVE-SHLD
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-NO-BMI2,X32-NO-SHLD,X32-NO-BMI2-NO-SHLD
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-NO-BMI2,X32-SHLD,X32-NO-BMI2-HAVE-SHLD
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-BMI2,X32-NO-SHLD,X32-HAVE-BMI2-NO-SHLD
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-BMI2,X32-SHLD,X32-HAVE-BMI2-HAVE-SHLD
+
+define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; X64-NO-BMI2-LABEL: lshr_4bytes:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
+; X64-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-BMI2-NEXT:    shrl %cl, %eax
+; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: lshr_4bytes:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-BMI2-NEXT:    shrxl %eax, (%rdi), %eax
+; X64-BMI2-NEXT:    movl %eax, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X32-NO-BMI2-LABEL: lshr_4bytes:
+; X32-NO-BMI2:       # %bb.0:
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NEXT:    movl (%edx), %edx
+; X32-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
+; X32-NO-BMI2-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NEXT:    movl %edx, (%eax)
+; X32-NO-BMI2-NEXT:    retl
+;
+; X32-BMI2-LABEL: lshr_4bytes:
+; X32-BMI2:       # %bb.0:
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-BMI2-NEXT:    movzbl (%edx), %edx
+; X32-BMI2-NEXT:    shrxl %edx, (%ecx), %ecx
+; X32-BMI2-NEXT:    movl %ecx, (%eax)
+; X32-BMI2-NEXT:    retl
+  %src = load i32, ptr %src.ptr, align 1
+  %bitOff = load i32, ptr %bitOff.ptr, align 1
+  %res = lshr i32 %src, %bitOff
+  store i32 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; X64-NO-BMI2-LABEL: shl_4bytes:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
+; X64-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-BMI2-NEXT:    shll %cl, %eax
+; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: shl_4bytes:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-BMI2-NEXT:    shlxl %eax, (%rdi), %eax
+; X64-BMI2-NEXT:    movl %eax, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X32-NO-BMI2-LABEL: shl_4bytes:
+; X32-NO-BMI2:       # %bb.0:
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NEXT:    movl (%edx), %edx
+; X32-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
+; X32-NO-BMI2-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NEXT:    movl %edx, (%eax)
+; X32-NO-BMI2-NEXT:    retl
+;
+; X32-BMI2-LABEL: shl_4bytes:
+; X32-BMI2:       # %bb.0:
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-BMI2-NEXT:    movzbl (%edx), %edx
+; X32-BMI2-NEXT:    shlxl %edx, (%ecx), %ecx
+; X32-BMI2-NEXT:    movl %ecx, (%eax)
+; X32-BMI2-NEXT:    retl
+  %src = load i32, ptr %src.ptr, align 1
+  %bitOff = load i32, ptr %bitOff.ptr, align 1
+  %res = shl i32 %src, %bitOff
+  store i32 %res, ptr %dst, align 1
+  ret void
+}
+define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; X64-NO-BMI2-LABEL: ashr_4bytes:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
+; X64-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-BMI2-NEXT:    sarl %cl, %eax
+; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: ashr_4bytes:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-BMI2-NEXT:    sarxl %eax, (%rdi), %eax
+; X64-BMI2-NEXT:    movl %eax, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X32-NO-BMI2-LABEL: ashr_4bytes:
+; X32-NO-BMI2:       # %bb.0:
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NEXT:    movl (%edx), %edx
+; X32-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
+; X32-NO-BMI2-NEXT:    sarl %cl, %edx
+; X32-NO-BMI2-NEXT:    movl %edx, (%eax)
+; X32-NO-BMI2-NEXT:    retl
+;
+; X32-BMI2-LABEL: ashr_4bytes:
+; X32-BMI2:       # %bb.0:
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-BMI2-NEXT:    movzbl (%edx), %edx
+; X32-BMI2-NEXT:    sarxl %edx, (%ecx), %ecx
+; X32-BMI2-NEXT:    movl %ecx, (%eax)
+; X32-BMI2-NEXT:    retl
+  %src = load i32, ptr %src.ptr, align 1
+  %bitOff = load i32, ptr %bitOff.ptr, align 1
+  %res = ashr i32 %src, %bitOff
+  store i32 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; X64-NO-BMI2-LABEL: lshr_8bytes:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-BMI2-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: lshr_8bytes:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-BMI2-NEXT:    shrxq %rax, (%rdi), %rax
+; X64-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: lshr_8bytes:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: lshr_8bytes:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: lshr_8bytes:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%edx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_8bytes:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %src = load i64, ptr %src.ptr, align 1
+  %bitOff = load i64, ptr %bitOff.ptr, align 1
+  %res = lshr i64 %src, %bitOff
+  store i64 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; X64-NO-BMI2-LABEL: shl_8bytes:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-BMI2-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: shl_8bytes:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-BMI2-NEXT:    shlxq %rax, (%rdi), %rax
+; X64-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: shl_8bytes:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: shl_8bytes:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: shl_8bytes:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, 4(%edx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: shl_8bytes:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %src = load i64, ptr %src.ptr, align 1
+  %bitOff = load i64, ptr %bitOff.ptr, align 1
+  %res = shl i64 %src, %bitOff
+  store i64 %res, ptr %dst, align 1
+  ret void
+}
+define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; X64-NO-BMI2-LABEL: ashr_8bytes:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-BMI2-NEXT:    sarq %cl, %rax
+; X64-NO-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: ashr_8bytes:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-BMI2-NEXT:    sarxq %rax, (%rdi), %rax
+; X64-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: ashr_8bytes:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: ashr_8bytes:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: ashr_8bytes:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%edx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esi), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %edx, %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_8bytes:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %src = load i64, ptr %src.ptr, align 1
+  %bitOff = load i64, ptr %bitOff.ptr, align 1
+  %res = ashr i64 %src, %bitOff
+  store i64 %res, ptr %dst, align 1
+  ret void
+}
+
+; lshr_16bytes: logical shift right of an i128 whose value, shift amount,
+; and destination are all memory operands with align 1 (unaligned).  The
+; CHECK blocks below were autogenerated by utils/update_llc_test_checks.py,
+; one prefix per RUN configuration -- presumably the BMI2 x SHLD feature
+; combinations for 64-bit and 32-bit targets; confirm against the RUN lines
+; at the top of this file.  Do not hand-edit the CHECK lines; regenerate
+; them with the script instead.
+define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: lshr_16bytes:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rdi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_16bytes:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_16bytes:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, (%rdi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rsi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rax, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_16bytes:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: lshr_16bytes:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $32, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: lshr_16bytes:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ah
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ah
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ah
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ah
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: lshr_16bytes:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel (%esp), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $32, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_16bytes:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $36, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %ecx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $36, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; Actual test body: the align-1 loads/stores keep the i128 accesses
+; unaligned, and the shift amount is a full i128 loaded from memory,
+; exercising the oversized-shift legalization paths named in the commit
+; title.  The generated code above only reads the low byte of %bitOff
+; (movzbl), which is sufficient since shift amounts >= 128 are poison
+; for an i128 lshr.
+  %src = load i128, ptr %src.ptr, align 1
+  %bitOff = load i128, ptr %bitOff.ptr, align 1
+  %res = lshr i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: shl_16bytes:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %eax, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rsi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_16bytes:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, 8(%rdi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %rax, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rdi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rax, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: shl_16bytes:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    decb %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %edx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $32, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $36, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %dh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $36, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: shl_16bytes:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel (%esp), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $36, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %ecx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $36, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %src = load i128, ptr %src.ptr, align 1
+  %bitOff = load i128, ptr %bitOff.ptr, align 1
+  %res = shl i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: ashr_16bytes:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_16bytes:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_16bytes:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, (%rdi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rsi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rcx, %rax, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarq $63, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rsi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_16bytes:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: ashr_16bytes:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $36, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esi), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $36, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: ashr_16bytes:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ah
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ah
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ah
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ah
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: ashr_16bytes:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $36, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ebx, %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ecx, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel (%esp), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $36, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_16bytes:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %edx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, (%esp) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %src = load i128, ptr %src.ptr, align 1
+  %bitOff = load i128, ptr %bitOff.ptr, align 1
+  %res = ashr i128 %src, %bitOff
+  store i128 %res, ptr %dst, align 1
+  ret void
+}
+
+define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rdx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r11d, %r11d
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r13, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdx, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rdx
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r14, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r13, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    subb %al, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rax), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdx
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdx, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %r12, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %r13, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r10, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r11, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%r9)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 16(%r9)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, (%r9)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%r9)
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r14, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rdi, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r13, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r12, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_32bytes:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %r9d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %r10, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r15, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rax, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %r8, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r9d, %r13d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r13b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r12, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r14, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rbx, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r14, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r15, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r13b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %r9b, %r13b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r13d, %r15d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r15b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r15, %rbp, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r10, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r13b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %rbx, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r13, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%r9), %r11d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r11d, %r12d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r12b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r14, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r11, %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r11, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r11b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %r9b, %r9b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %r13, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %r15, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rcx, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rax, %rdi, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rax, %r11, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r8, %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r13, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r11, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r11, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r12, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r13, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $140, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-128, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %bl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-64, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb %ah, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    decb %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, (%esp) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, (%esp) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ch, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %al # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 12(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $140, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $120, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ebp), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ebp), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebp), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-128, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %dh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %dl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %bh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bh, %bh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsl %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bh, %bh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bh, %bh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bh, %bh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $120, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: lshr_32bytes:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $152, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esi), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esi), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%ecx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-128, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $bl killed $bl killed $ebx def $ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $bl killed $bl killed $ebx def $ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, (%esp) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, (%esp) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, (%esp) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 28(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 16(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 20(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 12(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $152, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $120, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edi), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-128, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %edx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edi, %eax, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $120, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %src = load i256, ptr %src.ptr, align 1
+  %bitOff = load i256, ptr %bitOff.ptr, align 1
+  %res = lshr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rdx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r10d, %r10d
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbp, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    subb %al, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdx,%rdx), %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rax), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdx
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdx, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %rbx, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %rbp, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r10, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r10, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, (%r9)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 8(%r9)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 24(%r9)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 16(%r9)
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r14, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbp, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r13, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r12, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r11, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r14, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_32bytes:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r9, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r15, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %rax, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %r13d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r13b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r12, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r10, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r14, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r15, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r13b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %dil, %r13b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r13d, %r15d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r15b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r15, %rbp, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r9, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r13b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r10, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r13, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rdi), %ebx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %r12d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r12b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r12, %r14, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %bl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r9, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %dil, %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %r13, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rcx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rcx, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rdi, %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r13, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %r8, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbx, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %r10, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbp, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r12, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r13, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r10, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r14, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %rbx, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $136, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ecx), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ch # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esi), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-128, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %bh # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    decb %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bh, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $64, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-64, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %bh # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bh, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ch, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bh, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ch # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ch, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %ebx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ch, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ch, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ch, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 28(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 16(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 20(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $136, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $116, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebx), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebx), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-128, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $64, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %bh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %dh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bh, %bh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bh, %bh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ecx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 4(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 28(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 16(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $116, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: shl_32bytes:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $156, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%ecx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-128, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb (%esp), %bl # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb (%esp), %dl # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, (%esp) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 28(%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 16(%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 20(%ecx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $156, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $124, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebp), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebp), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ebp), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ebp), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebp), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-128, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $al killed $al killed $eax def $eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, (%esp) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, (%esp) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %dl, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %eax, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb (%esp), %cl # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, (%esp) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %ecx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 24(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $124, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %src = load i256, ptr %src.ptr, align 1
+  %bitOff = load i256, ptr %bitOff.ptr, align 1
+  %res = shl i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %edx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r12, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %dl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rax, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %dl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r13, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r12,%r12), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r9, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %dl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r14, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %dl
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r13, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    subb %dl, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r13, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rax, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rdx), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r9, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %r13, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %rbp, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r12, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r11, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%r10)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 16(%r10)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, (%r10)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%r10)
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r14, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbp, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbp, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rdi, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r13, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r12, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_32bytes:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %r9d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rcx, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r15, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rax, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarq $63, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %r9, %r10, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r12, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rsi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r9d, %ebp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bpl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rcx,%rcx), %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbp, %r13, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r15, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r9, %r14, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbp, %r15, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r9b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %r9b, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %r12d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r12b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r12, %rbp, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r14, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rax, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rbp, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%r9), %r11d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r11d, %r13d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r13b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r15, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r11, %r14, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %r11, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r11b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rsi, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %r9b, %r9b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %rax, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %r12, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rsi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %rsi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r15, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rax, %rdi, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r12d, %r12d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rax, %r11, %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r13, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r13, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r8, %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r8, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbp, %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r14, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rax), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %r11, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r11, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %r13, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %rbp, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %rsi, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $144, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ecx), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-128, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bh, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $-64, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb %ah, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    decb %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ah, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ch, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %ah, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $144, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $116, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ebx), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ebx), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebx), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebx), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebx), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebx), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-128, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %dh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %dl, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dh
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %ch, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $116, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: ashr_32bytes:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $156, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ebx, %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-128, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ebx, %edi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $bl killed $bl killed $ebx def $ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%esi), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $-64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %edi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 28(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 16(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 20(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $156, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $132, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebp, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-128, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %edx, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %esi, %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebp, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel (%esp), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, (%esp) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, (%esp) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $132, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %src = load i256, ptr %src.ptr, align 1
+  %bitOff = load i256, ptr %bitOff.ptr, align 1
+  %res = ashr i256 %src, %bitOff
+  store i256 %res, ptr %dst, align 1
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ALL: {{.*}}
+; X32: {{.*}}
+; X32-NO-SHLD: {{.*}}
+; X32-SHLD: {{.*}}
+; X64: {{.*}}
+; X64-NO-SHLD: {{.*}}
+; X64-SHLD: {{.*}}


        


More information about the llvm-commits mailing list