[llvm] aad7259 - [NFC][Codegen][X86] Add codegen test coverage for the variably-indexed load of alloca w/zero upper half

Roman Lebedev via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 23 09:17:05 PST 2022


Author: Roman Lebedev
Date: 2022-12-23T20:16:41+03:00
New Revision: aad725928d767243dff1fc66f26b7afb17e29865

URL: https://github.com/llvm/llvm-project/commit/aad725928d767243dff1fc66f26b7afb17e29865
DIFF: https://github.com/llvm/llvm-project/commit/aad725928d767243dff1fc66f26b7afb17e29865.diff

LOG: [NFC][Codegen][X86] Add codegen test coverage for the variably-indexed load of alloca w/zero upper half

Added: 
    llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
new file mode 100644
index 000000000000..d1ad4192a6d2
--- /dev/null
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -0,0 +1,10973 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-NO-BMI2,X64-NO-SHLD,X64-NO-BMI2-NO-SHLD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-NO-BMI2,X64-SHLD,X64-NO-BMI2-HAVE-SHLD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-BMI2,X64-NO-SHLD,X64-HAVE-BMI2-NO-SHLD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-BMI2,X64-SHLD,X64-HAVE-BMI2-HAVE-SHLD
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-NO-BMI2,X32-NO-SHLD,X32-NO-BMI2-NO-SHLD
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-NO-BMI2,X32-SHLD,X32-NO-BMI2-HAVE-SHLD
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-BMI2,X32-NO-SHLD,X32-HAVE-BMI2-NO-SHLD
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-BMI2,X32-SHLD,X32-HAVE-BMI2-HAVE-SHLD
+
+define void @load_1byte_chunk_of_2byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; Reads 1 byte from %src, pairs it with a constant zero byte to form an i16, shifts that right by 8*%byteOff bits and stores the low byte to %dst.
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movzbl (%rdi), %eax
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shrl %cl, %eax
+; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movzbl (%rdi), %eax
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    shrxl %esi, %eax, %eax
+; X64-BMI2-NEXT:    movb %al, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X32-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2:       # %bb.0:
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NEXT:    movzbl (%eax), %eax
+; X32-NO-BMI2-NEXT:    shll $3, %ecx
+; X32-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NEXT:    movb %al, (%edx)
+; X32-NO-BMI2-NEXT:    retl
+;
+; X32-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
+; X32-BMI2:       # %bb.0:
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-BMI2-NEXT:    movzbl (%edx), %edx
+; X32-BMI2-NEXT:    shll $3, %ecx
+; X32-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
+; X32-BMI2-NEXT:    movb %cl, (%eax)
+; X32-BMI2-NEXT:    retl
+  %init1 = load i8, ptr %src, align 1 ; the only byte that comes from memory
+  %intermediate.sroa.0.0.vec.insert = insertelement <2 x i8> <i8 poison, i8 0>, i8 %init1, i64 0 ; lane 0 = loaded byte, lane 1 = constant 0 (the zero upper half)
+  %intermediate.val.frozen = freeze <2 x i8> %intermediate.sroa.0.0.vec.insert ; freeze the vector before it is reinterpreted as an integer
+  %intermediate.val.frozen.bits = bitcast <2 x i8> %intermediate.val.frozen to i16 ; view both lanes as one 16-bit integer
+  %byteOff.tr = trunc i64 %byteOff to i16 ; narrow the offset to the width of the shifted value
+  %byteOff.numbits.wide = shl i16 %byteOff.tr, 3 ; byte offset -> bit offset (multiply by 8)
+  %intermediate.val.frozen.bits.positioned = lshr i16 %intermediate.val.frozen.bits, %byteOff.numbits.wide ; move the requested byte down to bit 0
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i16 %intermediate.val.frozen.bits.positioned to i8 ; keep only the low byte
+  %1 = insertelement <1 x i8> poison, i8 %intermediate.val.frozen.bits.positioned.extracted, i64 0 ; wrap the byte in a single-element vector for the store
+  store <1 x i8> %1, ptr %dst, align 1
+  ret void
+}
+
+define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; Reads 2 bytes from %src, extends them with two constant zero bytes to an i32, shifts that right by 8*%byteOff bits and stores the low byte to %dst.
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movzwl (%rdi), %eax
+; X64-NO-BMI2-NEXT:    movzwl %ax, %eax
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shrl %cl, %eax
+; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movzwl (%rdi), %eax
+; X64-BMI2-NEXT:    movzwl %ax, %eax
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    shrxl %esi, %eax, %eax
+; X64-BMI2-NEXT:    movb %al, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X32-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2:       # %bb.0:
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NEXT:    movzwl (%edx), %edx
+; X32-NO-BMI2-NEXT:    movzwl %dx, %edx
+; X32-NO-BMI2-NEXT:    shll $3, %ecx
+; X32-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NEXT:    movb %dl, (%eax)
+; X32-NO-BMI2-NEXT:    retl
+;
+; X32-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
+; X32-BMI2:       # %bb.0:
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-BMI2-NEXT:    movzwl (%edx), %edx
+; X32-BMI2-NEXT:    movzwl %dx, %edx
+; X32-BMI2-NEXT:    shll $3, %ecx
+; X32-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
+; X32-BMI2-NEXT:    movb %cl, (%eax)
+; X32-BMI2-NEXT:    retl
+  %init = load <2 x i8>, ptr %src, align 1 ; the two bytes that come from memory
+  %intermediate.sroa.0.0.vec.expand = shufflevector <2 x i8> %init, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ; widen to 4 lanes; lanes 2-3 are left undefined by this mask
+  %intermediate.sroa.0.0.vecblend = shufflevector <4 x i8> %intermediate.sroa.0.0.vec.expand, <4 x i8> <i8 poison, i8 poison, i8 0, i8 0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7> ; lanes 0-1 keep the loaded bytes, lanes 2-3 take the constant's zero elements (mask indices 6 and 7)
+  %intermediate.val.frozen = freeze <4 x i8> %intermediate.sroa.0.0.vecblend ; freeze the vector before it is reinterpreted as an integer
+  %intermediate.val.frozen.bits = bitcast <4 x i8> %intermediate.val.frozen to i32 ; view the four lanes as one 32-bit integer
+  %byteOff.tr = trunc i64 %byteOff to i32 ; narrow the offset to the width of the shifted value
+  %byteOff.numbits.wide = shl i32 %byteOff.tr, 3 ; byte offset -> bit offset (multiply by 8)
+  %intermediate.val.frozen.bits.positioned = lshr i32 %intermediate.val.frozen.bits, %byteOff.numbits.wide ; move the requested byte down to bit 0
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i32 %intermediate.val.frozen.bits.positioned to i8 ; keep only the low byte
+  %1 = insertelement <1 x i8> poison, i8 %intermediate.val.frozen.bits.positioned.extracted, i64 0 ; wrap the byte in a single-element vector for the store
+  store <1 x i8> %1, ptr %dst, align 1
+  ret void
+}
+
+define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; Reads 2 bytes from %src, extends them with two constant zero bytes to an i32, shifts that right by 8*%byteOff bits and stores the low 16 bits to %dst.
+; X64-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movzwl (%rdi), %eax
+; X64-NO-BMI2-NEXT:    movzwl %ax, %eax
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shrl %cl, %eax
+; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movzwl (%rdi), %eax
+; X64-BMI2-NEXT:    movzwl %ax, %eax
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    shrxl %esi, %eax, %eax
+; X64-BMI2-NEXT:    movw %ax, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X32-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2:       # %bb.0:
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NEXT:    movzwl (%edx), %edx
+; X32-NO-BMI2-NEXT:    movzwl %dx, %edx
+; X32-NO-BMI2-NEXT:    shll $3, %ecx
+; X32-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NEXT:    movw %dx, (%eax)
+; X32-NO-BMI2-NEXT:    retl
+;
+; X32-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
+; X32-BMI2:       # %bb.0:
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-BMI2-NEXT:    movzwl (%edx), %edx
+; X32-BMI2-NEXT:    movzwl %dx, %edx
+; X32-BMI2-NEXT:    shll $3, %ecx
+; X32-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
+; X32-BMI2-NEXT:    movw %cx, (%eax)
+; X32-BMI2-NEXT:    retl
+  %init = load <2 x i8>, ptr %src, align 1 ; the two bytes that come from memory
+  %intermediate.sroa.0.0.vec.expand = shufflevector <2 x i8> %init, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ; widen to 4 lanes; lanes 2-3 are left undefined by this mask
+  %intermediate.sroa.0.0.vecblend = shufflevector <4 x i8> %intermediate.sroa.0.0.vec.expand, <4 x i8> <i8 poison, i8 poison, i8 0, i8 0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7> ; lanes 0-1 keep the loaded bytes, lanes 2-3 take the constant's zero elements (mask indices 6 and 7)
+  %intermediate.val.frozen = freeze <4 x i8> %intermediate.sroa.0.0.vecblend ; freeze the vector before it is reinterpreted as an integer
+  %intermediate.val.frozen.bits = bitcast <4 x i8> %intermediate.val.frozen to i32 ; view the four lanes as one 32-bit integer
+  %byteOff.tr = trunc i64 %byteOff to i32 ; narrow the offset to the width of the shifted value
+  %byteOff.numbits.wide = shl i32 %byteOff.tr, 3 ; byte offset -> bit offset (multiply by 8)
+  %intermediate.val.frozen.bits.positioned = lshr i32 %intermediate.val.frozen.bits, %byteOff.numbits.wide ; move the requested chunk down to bit 0
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i32 %intermediate.val.frozen.bits.positioned to i16 ; keep only the low 16 bits
+  store i16 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 2
+  ret void
+}
+
+define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; Reads 4 bytes from %src, extends them with four constant zero bytes to an i64, shifts that right by 8*%byteOff bits and stores the low byte to %dst.
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    movq %xmm0, %rax
+; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
+; X64-BMI2-NEXT:    movb %al, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %bl, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <4 x i8>, ptr %src, align 1 ; the four bytes that come from memory
+  %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 8 lanes; lanes 4-7 are left undefined by this mask
+  %intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15> ; lanes 0-3 keep the loaded bytes, lanes 4-7 take the constant's zero elements (mask indices 12-15)
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 ; byte offset -> bit offset (multiply by 8); nuw/nsw make a wrapping result poison
+  %intermediate.val.frozen = freeze <8 x i8> %intermediate.sroa.0.0.vecblend ; freeze the vector before it is reinterpreted as an integer
+  %intermediate.val.frozen.bits = bitcast <8 x i8> %intermediate.val.frozen to i64 ; view the eight lanes as one 64-bit integer
+  %intermediate.val.frozen.bits.positioned = lshr i64 %intermediate.val.frozen.bits, %byteOff.numbits ; move the requested byte down to bit 0
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i64 %intermediate.val.frozen.bits.positioned to i8 ; keep only the low byte
+  %1 = insertelement <1 x i8> poison, i8 %intermediate.val.frozen.bits.positioned.extracted, i64 0 ; wrap the byte in a single-element vector for the store
+  store <1 x i8> %1, ptr %dst, align 1
+  ret void
+}
+
+define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; Reads 4 bytes from %src, extends them with four constant zero bytes to an i64, shifts that right by 8*%byteOff bits and stores the low 16 bits to %dst.
+; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    movq %xmm0, %rax
+; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
+; X64-BMI2-NEXT:    movw %ax, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movw %si, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %si, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <4 x i8>, ptr %src, align 1 ; the four bytes that come from memory
+  %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 8 lanes; lanes 4-7 are left undefined by this mask
+  %intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15> ; lanes 0-3 keep the loaded bytes, lanes 4-7 take the constant's zero elements (mask indices 12-15)
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 ; byte offset -> bit offset (multiply by 8); nuw/nsw make a wrapping result poison
+  %intermediate.val.frozen = freeze <8 x i8> %intermediate.sroa.0.0.vecblend ; freeze the vector before it is reinterpreted as an integer
+  %intermediate.val.frozen.bits = bitcast <8 x i8> %intermediate.val.frozen to i64 ; view the eight lanes as one 64-bit integer
+  %intermediate.val.frozen.bits.positioned = lshr i64 %intermediate.val.frozen.bits, %byteOff.numbits ; move the requested chunk down to bit 0
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i64 %intermediate.val.frozen.bits.positioned to i16 ; keep only the low 16 bits
+  store i16 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 2
+  ret void
+}
+
+define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; Reads 4 bytes from %src, extends them with four constant zero bytes to an i64, shifts that right by 8*%byteOff bits and stores the low 32 bits to %dst.
+; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2:       # %bb.0:
+; X64-NO-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT:    retq
+;
+; X64-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-BMI2-NEXT:    shll $3, %esi
+; X64-BMI2-NEXT:    movq %xmm0, %rax
+; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
+; X64-BMI2-NEXT:    movl %eax, (%rdx)
+; X64-BMI2-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%edx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <4 x i8>, ptr %src, align 1 ; the four bytes that come from memory
+  %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 8 lanes; lanes 4-7 are left undefined by this mask
+  %intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15> ; lanes 0-3 keep the loaded bytes, lanes 4-7 take the constant's zero elements (mask indices 12-15)
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 ; byte offset -> bit offset (multiply by 8); nuw/nsw make a wrapping result poison
+  %intermediate.val.frozen = freeze <8 x i8> %intermediate.sroa.0.0.vecblend ; freeze the vector before it is reinterpreted as an integer
+  %intermediate.val.frozen.bits = bitcast <8 x i8> %intermediate.val.frozen to i64 ; view the eight lanes as one 64-bit integer
+  %intermediate.val.frozen.bits.positioned = lshr i64 %intermediate.val.frozen.bits, %byteOff.numbits ; move the requested chunk down to bit 0
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i64 %intermediate.val.frozen.bits.positioned to i32 ; keep only the low 32 bits
+  store i32 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 4
+  ret void
+}
+
+define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; Builds a 16-byte value (low 8 bytes loaded from %src, high 8 bytes zero), shifts it right by %byteOff bytes and stores the low byte to %dst. The assertion lines below are autogenerated (see the NOTE at the top of the file); regenerate them rather than editing by hand.
+; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movb %al, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb %sil, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %al, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %sil, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $4, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %dl, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $4, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, (%esp), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $4, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebp, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %bl, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <8 x i8>, ptr %src, align 1 ; read 8 bytes from %src; align 1 means no alignment is assumed
+  %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 16 lanes: lanes 0-7 keep the loaded bytes, lanes 8-15 are left undefined for now
+  %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; mask indices 24-31 select lanes 8-15 of the second operand, so the upper 8 bytes become zero
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 ; byte offset * 8 = shift amount in bits (nuw/nsw assert that the multiply does not overflow)
+  %intermediate.val.frozen = freeze <16 x i8> %intermediate.sroa.0.0.vecblend ; freeze pins any undef/poison lanes to a fixed value before the integer arithmetic
+  %intermediate.val.frozen.bits = bitcast <16 x i8> %intermediate.val.frozen to i128 ; view the 16 bytes as one i128 (lane 0 should be the least significant byte on the little-endian x86 targets in the RUN lines)
+  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i128 ; widen the shift amount to match the i128 operand
+  %intermediate.val.frozen.bits.positioned = lshr i128 %intermediate.val.frozen.bits, %byteOff.numbits.wide ; logical shift right brings the requested chunk down to bit 0, shifting in zeros from the top
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i128 %intermediate.val.frozen.bits.positioned to i8 ; keep only the low 8 bits, i.e. the 1-byte chunk
+  %1 = insertelement <1 x i8> poison, i8 %intermediate.val.frozen.bits.positioned.extracted, i64 0 ; wrap the byte in a single-element vector
+  store <1 x i8> %1, ptr %dst, align 1 ; write the 1-byte result to %dst
+  ret void
+}
+
+define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; Builds a 16-byte value (low 8 bytes loaded from %src, high 8 bytes zero), shifts it right by %byteOff bytes and stores the low 2 bytes to %dst. The assertion lines below are autogenerated (see the NOTE at the top of the file); regenerate them rather than editing by hand.
+; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movw %ax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movw %si, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %ax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %si, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $4, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movw %bp, (%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movw %bp, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $4, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebp, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %di, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $4, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <8 x i8>, ptr %src, align 1 ; read 8 bytes from %src; align 1 means no alignment is assumed
+  %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 16 lanes: lanes 0-7 keep the loaded bytes, lanes 8-15 are left undefined for now
+  %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; mask indices 24-31 select lanes 8-15 of the second operand, so the upper 8 bytes become zero
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 ; byte offset * 8 = shift amount in bits (nuw/nsw assert that the multiply does not overflow)
+  %intermediate.val.frozen = freeze <16 x i8> %intermediate.sroa.0.0.vecblend ; freeze pins any undef/poison lanes to a fixed value before the integer arithmetic
+  %intermediate.val.frozen.bits = bitcast <16 x i8> %intermediate.val.frozen to i128 ; view the 16 bytes as one i128 (lane 0 should be the least significant byte on the little-endian x86 targets in the RUN lines)
+  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i128 ; widen the shift amount to match the i128 operand
+  %intermediate.val.frozen.bits.positioned = lshr i128 %intermediate.val.frozen.bits, %byteOff.numbits.wide ; logical shift right brings the requested chunk down to bit 0, shifting in zeros from the top
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i128 %intermediate.val.frozen.bits.positioned to i16 ; keep only the low 16 bits, i.e. the 2-byte chunk
+  store i16 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 2 ; write the 2-byte result to %dst, which is declared 2-byte aligned here
+  ret void
+}
+
+define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; Stores to %dst a 4-byte chunk, selected by %byteOff, of a 16-byte value whose low 8 bytes come from %src and whose high 8 bytes are zero.
+; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $4, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $4, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebp, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $4, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <8 x i8>, ptr %src, align 1 ; read 8 bytes from %src, assuming only byte alignment
+  %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 16 lanes, lanes 0-7 are the loaded bytes and lanes 8-15 are left undefined
+  %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; keep lanes 0-7, take lanes 8-15 from the constant operand (mask indices 24-31), which are zero bytes
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 ; byte offset times 8 gives the shift amount in bits
+  %intermediate.val.frozen = freeze <16 x i8> %intermediate.sroa.0.0.vecblend ; freeze so any undef/poison lanes become fixed arbitrary values
+  %intermediate.val.frozen.bits = bitcast <16 x i8> %intermediate.val.frozen to i128 ; reinterpret the 16 bytes as one 128-bit integer
+  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i128 ; the shift amount must have the same type as the shifted value
+  %intermediate.val.frozen.bits.positioned = lshr i128 %intermediate.val.frozen.bits, %byteOff.numbits.wide ; logical shift right by the bit offset, moving the requested chunk down to bit 0
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i128 %intermediate.val.frozen.bits.positioned to i32 ; keep the low 32 bits, i.e. the 4-byte chunk
+  store i32 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 4 ; write the chunk to %dst
+  ret void
+}
+
+define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $24, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, (%esp) # 4-byte Folded Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel (%esp), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $24, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $24, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $24, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $28, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $28, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $28, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, (%esp), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $28, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <8 x i8>, ptr %src, align 1
+  %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
+  %intermediate.val.frozen = freeze <16 x i8> %intermediate.sroa.0.0.vecblend
+  %intermediate.val.frozen.bits = bitcast <16 x i8> %intermediate.val.frozen to i128
+  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i128
+  %intermediate.val.frozen.bits.positioned = lshr i128 %intermediate.val.frozen.bits, %byteOff.numbits.wide
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i128 %intermediate.val.frozen.bits.positioned to i64
+  store i64 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 8
+  ret void
+}
+
+define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; Stores to %dst the byte found at byte offset %byteOff of a 32-byte value whose low 16 bytes are loaded from %src and whose high 16 bytes are zero.
+; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r9, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movb %dil, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsl %esi, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r8d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r8b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %eax, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsl %esi, %edi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %al, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $8, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $8, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, (%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, (%esp), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $4, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $8, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebp, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %cl, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $8, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <16 x i8>, ptr %src, align 1 ; read 16 bytes from %src with byte alignment
+  %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 32 lanes: lanes 0-15 come from %init, lanes 16-31 are undefined
+  %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> ; keep lanes 0-15; mask indices 48-63 pick the zero elements 16-31 of the constant operand, so the upper half is all zero
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 ; byte offset times 8 gives the shift amount in bits
+  %intermediate.val.frozen = freeze <32 x i8> %intermediate.sroa.0.0.vecblend ; freeze the blended vector before it is reinterpreted as an integer
+  %intermediate.val.frozen.bits = bitcast <32 x i8> %intermediate.val.frozen to i256 ; view the 32 bytes as one 256-bit integer
+  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i256 ; widen the shift amount to match the shifted type
+  %intermediate.val.frozen.bits.positioned = lshr i256 %intermediate.val.frozen.bits, %byteOff.numbits.wide ; logical right shift moves the requested chunk down to bit 0
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i256 %intermediate.val.frozen.bits.positioned to i8 ; keep only the low 8 bits, i.e. the 1-byte chunk
+  %1 = insertelement <1 x i8> poison, i8 %intermediate.val.frozen.bits.positioned.extracted, i64 0 ; wrap the byte in a one-element vector
+  store <1 x i8> %1, ptr %dst, align 1 ; write the single-byte result to %dst
+  ret void
+}
+
+define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r9, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movw %di, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsl %esi, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movw %ax, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r8d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r8b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %eax, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsl %esi, %edi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %ax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $8, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, (%esp) # 4-byte Folded Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $8, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movw %si, (%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, (%esp), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $4, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $8, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebp, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %cx, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $8, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <16 x i8>, ptr %src, align 1
+  %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
+  %intermediate.val.frozen = freeze <32 x i8> %intermediate.sroa.0.0.vecblend
+  %intermediate.val.frozen.bits = bitcast <32 x i8> %intermediate.val.frozen to i256
+  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i256
+  %intermediate.val.frozen.bits.positioned = lshr i256 %intermediate.val.frozen.bits, %byteOff.numbits.wide
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i256 %intermediate.val.frozen.bits.positioned to i16
+  store i16 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 2
+  ret void
+}
+
+define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r9, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsl %esi, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r8d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r8b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsl %eax, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsl %esi, %edi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $8, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, (%esp) # 4-byte Folded Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $8, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm0
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, (%esp), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $4, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $8, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%ecx), %xmm0
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %ebp, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $8, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <16 x i8>, ptr %src, align 1
+  %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
+  %intermediate.val.frozen = freeze <32 x i8> %intermediate.sroa.0.0.vecblend
+  %intermediate.val.frozen.bits = bitcast <32 x i8> %intermediate.val.frozen to i256
+  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i256
+  %intermediate.val.frozen.bits.positioned = lshr i256 %intermediate.val.frozen.bits, %byteOff.numbits.wide
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i256 %intermediate.val.frozen.bits.positioned to i32
+  store i32 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 4
+  ret void
+}
+
+define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; Extracts 8 bytes at byte offset %byteOff from a 32-byte value whose first 16 bytes are loaded from %src and whose last 16 bytes are zero, then stores them to %dst.
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r9, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovnsq %rax, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovnsq %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r8d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r8b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovnsq %rax, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnsq %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $24, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, (%esp) # 4-byte Folded Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %cl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $24, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $24, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $24, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %al, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $32, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $24, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, (%esp) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $24, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <16 x i8>, ptr %src, align 1 ; 16-byte load of the source data with only byte alignment (align 1)
+  %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; widen to 32 lanes: lanes 0-15 come from %init, lanes 16-31 are left undefined by the mask
+  %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> ; keep lanes 0-15 and fill lanes 16-31 with zero (mask indices 48-63 select the zero elements of the second operand)
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 ; byte offset -> bit offset (multiply by 8)
+  %intermediate.val.frozen = freeze <32 x i8> %intermediate.sroa.0.0.vecblend ; freeze so any undef/poison lane becomes one fixed value
+  %intermediate.val.frozen.bits = bitcast <32 x i8> %intermediate.val.frozen to i256 ; reinterpret the 32 bytes as a single 256-bit integer
+  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i256 ; widen the shift amount to the type being shifted
+  %intermediate.val.frozen.bits.positioned = lshr i256 %intermediate.val.frozen.bits, %byteOff.numbits.wide ; logical right shift by the bit offset
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i256 %intermediate.val.frozen.bits.positioned to i64 ; keep the low 64 bits, i.e. the 8-byte chunk
+  store i64 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 8 ; write the extracted chunk to %dst
+  ret void
+}
+
+define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rcx, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %rcx, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovsq %rcx, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %r9d, %r9d
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %r9, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovsq %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rsi, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r8d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r8b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %r9d, %r9d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb %sil, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovsq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r9d, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %r9, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsq %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rsi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb $64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    decb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    negb %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb %bl, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovsl %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb $64, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb %bl, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ecx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovsl %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb $64, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    decb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb %dl, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %ebx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovsl %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 12(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $36, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb %cl, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovsl %ecx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $36, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <16 x i8>, ptr %src, align 1
+  %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
+  %intermediate.val.frozen = freeze <32 x i8> %intermediate.sroa.0.0.vecblend
+  %intermediate.val.frozen.bits = bitcast <32 x i8> %intermediate.val.frozen to i256
+  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i256
+  %intermediate.val.frozen.bits.positioned = lshr i256 %intermediate.val.frozen.bits, %byteOff.numbits.wide
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i256 %intermediate.val.frozen.bits.positioned to i128
+  store i128 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 16
+  ret void
+}
+
+define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    subb %sil, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r10d, %r10d
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rax, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r14, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %r9d
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovbl %r9d, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovel %r8d, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovael %r10d, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovel %r8d, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movb %dil, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %r9d, %r9d
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r11, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r11, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %r8d
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %r8d, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %r9d, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r10b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %sil, %r10b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r8, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r10b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %r14, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %r10d, %r9d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r10d, %r11d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r11b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rcx,%rcx), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r11, %rbx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r10b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %r9d, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %r9, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %sil, %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rdi, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %r9d, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %r11d, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %r8b, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $12, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm1
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(,%ecx,8), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb %dl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb %al, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(%ecx), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    negl %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $12, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $8, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(,%ecx,8), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %dl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%esi), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negl %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $8, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $12, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%ecx), %xmm0
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(,%eax,8), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ebp), %xmm0
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, (%esp), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %dl, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%esi), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael (%esp), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $12, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm2
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(,%ebx,8), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %dl, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebp, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%ebx), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %eax, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %ebx # imm = 0x100
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %cl, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $4, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <32 x i8>, ptr %src, align 1
+  %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
+  %intermediate.val.frozen = freeze <64 x i8> %intermediate.sroa.0.0.vecblend
+  %intermediate.val.frozen.bits = bitcast <64 x i8> %intermediate.val.frozen to i512
+  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i512
+  %intermediate.val.frozen.bits.positioned = lshr i512 %intermediate.val.frozen.bits, %byteOff.numbits.wide
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i512 %intermediate.val.frozen.bits.positioned to i8
+  %1 = insertelement <1 x i8> poison, i8 %intermediate.val.frozen.bits.positioned.extracted, i64 0
+  store <1 x i8> %1, ptr %dst, align 1
+  ret void
+}
+
+define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    subb %sil, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r10d, %r10d
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rax, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r14, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %r9d
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovbl %r9d, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovel %r8d, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovael %r10d, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovel %r8d, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movw %di, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %r9d, %r9d
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r11, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r11, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %r8d
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %r8d, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %r9d, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movw %ax, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r10b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %sil, %r10b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r8, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r10b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %r14, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %r10d, %r9d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r10d, %r11d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r11b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rcx,%rcx), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r11, %rbx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r10b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %r9d, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %r9, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %sil, %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rdi, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %r9d, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %r11d, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %r8w, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $16, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm1
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(,%ecx,8), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb %dl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb %al, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ebx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(%ebx), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    negl %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael (%esp), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movw %bx, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $16, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $8, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(,%ecx,8), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %dl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%edi), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negl %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %edi # imm = 0x100
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movw %si, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $8, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $12, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%ecx), %xmm0
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(,%eax,8), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ebp), %xmm0
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, (%esp), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %dl, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%esi), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael (%esp), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $12, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm2
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(,%ebx,8), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %dl, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebp, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%ebx), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %eax, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %ebx # imm = 0x100
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %cx, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $4, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <32 x i8>, ptr %src, align 1
+  %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
+  %intermediate.val.frozen = freeze <64 x i8> %intermediate.sroa.0.0.vecblend
+  %intermediate.val.frozen.bits = bitcast <64 x i8> %intermediate.val.frozen to i512
+  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i512
+  %intermediate.val.frozen.bits.positioned = lshr i512 %intermediate.val.frozen.bits, %byteOff.numbits.wide
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i512 %intermediate.val.frozen.bits.positioned to i16
+  store i16 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 2
+  ret void
+}
+
+define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    subb %sil, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r10d, %r10d
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rax, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r14, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %r9d
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovbl %r9d, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovel %r8d, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovael %r10d, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovel %r8d, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %r9d, %r9d
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r11, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r11, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %r8d
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %r8d, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %r9d, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r10b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %sil, %r10b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r8, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r10b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %r14, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %r10d, %r9d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r10d, %r11d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r11b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rcx,%rcx), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r11, %rbx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r10b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %r9d, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %r9, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %sil, %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rdi, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %r9d, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %r11d, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %r8d, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $12, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm1
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(,%ecx,8), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ebx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(%ecx), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    negl %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $12, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $8, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(,%ecx,8), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %dl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%edi), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negl %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %edi # imm = 0x100
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $8, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $12, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%ecx), %xmm0
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(,%eax,8), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %cl, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ebp), %xmm0
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, (%esp), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %dl, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%esi), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael (%esp), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $12, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm2
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(,%ebx,8), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %dl, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebp, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%ebx), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, (%esp), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %eax, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %ebx # imm = 0x100
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $4, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <32 x i8>, ptr %src, align 1
+  %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
+  %intermediate.val.frozen = freeze <64 x i8> %intermediate.sroa.0.0.vecblend
+  %intermediate.val.frozen.bits = bitcast <64 x i8> %intermediate.val.frozen to i512
+  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i512
+  %intermediate.val.frozen.bits.positioned = lshr i512 %intermediate.val.frozen.bits, %byteOff.numbits.wide
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i512 %intermediate.val.frozen.bits.positioned to i32
+  store i32 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 4
+  ret void
+}
+
+define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    subb %sil, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r10d, %r10d
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r14, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rax, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovbq %r9, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %r10, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %r10d, %r10d
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rbx, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r11, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovbq %r8, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r10, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r10b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %sil, %r10b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r8, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r10b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rdi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %r14, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rbx, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r10d, %r11d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r11b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rcx,%rcx), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r11, %rbx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r10b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r11, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovbq %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %rdi, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm2, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %r9, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r10, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %sil, %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rdi, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r11, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rcx, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbq %r10, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r11, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $68, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm1
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(%ebx), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    negl %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bl, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm3, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %eax, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(,%eax,8), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb %al, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ebx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %ecx # imm = 0x100
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $68, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $72, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(,%ecx,8), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $al killed $al killed $eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm4, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %al, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $-128, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negl %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %eax # imm = 0x100
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 4(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $72, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $80, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm1
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negl %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %dl, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(,%eax,8), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %al, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edi, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $eax def $eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %ecx # imm = 0x100
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 4(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $80, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $72, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(,%ecx,8), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %cl, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %al, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %edi # imm = 0x100
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 4(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $72, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <32 x i8>, ptr %src, align 1
+  %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
+  %intermediate.val.frozen = freeze <64 x i8> %intermediate.sroa.0.0.vecblend
+  %intermediate.val.frozen.bits = bitcast <64 x i8> %intermediate.val.frozen to i512
+  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i512
+  %intermediate.val.frozen.bits.positioned = lshr i512 %intermediate.val.frozen.bits, %byteOff.numbits.wide
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i512 %intermediate.val.frozen.bits.positioned to i64
+  store i64 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 8
+  ret void
+}
+
+define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    subb %sil, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r14, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r9, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovbq %r14, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovbq %r11, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %rbx, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %rbx, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %r10d, %r10d
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r14, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rdi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovbq %r11, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovbq %r15, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r10, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r10, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r8b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %sil, %r8b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %rdi, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r8d, %r11d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r11b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r9, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %r14d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r14, %r9, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r11, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r11d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r11b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r11, %r12, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %r12d, %r12d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r10, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r8b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r14d, %r8d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r8b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r8, %r10, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r14, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r14b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovbq %rbx, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovbq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %r12, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %r12, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %r8, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r9d, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbx, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r9, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rax, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r9, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r11, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r10, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r10, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r9, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbq %r14, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbq %r15, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r9, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r9, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $112, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm1
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(%eax), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    negl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb %bl, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ah
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ebx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm3, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(,%ecx,8), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb %al, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %eax, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %eax, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %edx # imm = 0x100
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $112, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $88, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%esi,8), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%ebp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negl %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %al, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(,%edx,8), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %cl, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ebp, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %ecx # imm = 0x100
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%ecx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $88, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $124, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%eax), %xmm1
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%eax), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negl %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %dl, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm3, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(,%eax,8), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, (%esp) # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al def $eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %al, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $eax def $eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, (%esp) # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %edi # imm = 0x100
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 4(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $124, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%ecx,8), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%ecx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %dl, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(,%eax,8), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %cl, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $bl killed $bl killed $ebx def $ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %cl, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebp, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %ecx # imm = 0x100
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel (%esp), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 12(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%ecx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <32 x i8>, ptr %src, align 1
+  %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
+  %intermediate.val.frozen = freeze <64 x i8> %intermediate.sroa.0.0.vecblend
+  %intermediate.val.frozen.bits = bitcast <64 x i8> %intermediate.val.frozen to i512
+  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i512
+  %intermediate.val.frozen.bits.positioned = lshr i512 %intermediate.val.frozen.bits, %byteOff.numbits.wide
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i512 %intermediate.val.frozen.bits.positioned to i128
+  store i128 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 16
+  ret void
+}
+
+define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rdx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm2, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %r11d, %r11d
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r13, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdx, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdx
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r13, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movb $-128, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    subb %sil, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rbx, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdx, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r12, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovbq %rbx, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %r11, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovbq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %r11, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r9, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %r11, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %r11, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %r11, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmovaeq %r11, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r9, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdx, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 24(%r8)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 16(%r8)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r12, 8(%r8)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, (%r8)
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbx, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbx, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r13, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbx, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rbx, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovbq %r12, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovbq %r13, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %rbx, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %rbx, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %rbx, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %rbx, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %rbx, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovaeq %rbx, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %r9d, %r9d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r15, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rdi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r9, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r13d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r13b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r14, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r11, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %r10, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r14, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r15, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb $-128, %r13b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    subb %sil, %r13b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r13d, %r15d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %r15b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r15, %rbp, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %rax, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r13b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r10, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r13, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r9, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%rsi), %r12d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %r12d, %ebx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %r14, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r12, %r10, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %r12b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r12, %rax, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r14, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r9, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovbq %r13, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovbq %r15, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %r9, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %r9, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %r9, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %r9, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovaeq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rcx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r10, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm1, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %r9, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %r11d, %r11d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rsi, %rdi, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r11, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r11, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %sil
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r12, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $-128, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %sil, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rax, %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r13, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r11, %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r14, %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r15, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rdi, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r15, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %r11, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbq %r12, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbq %r13, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r11, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r11, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r11, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovaeq %r11, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r8, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %r9, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rdi, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $168, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm1
+; X32-NO-BMI2-NO-SHLD-NEXT:    movdqu 16(%ecx), %xmm0
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm3, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm3, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm3, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm3, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm2, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ch # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(,%eax,8), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal -128(%edx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    negl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb %al, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %dh
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb $64, %al
+; X32-NO-BMI2-NO-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    addb $-64, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ecx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %eax, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovbl %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $64, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $128, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $256, %ebp # imm = 0x100
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovael %edi, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 28(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 24(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $168, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $140, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%ecx,8), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %eax, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(%ebx), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negl %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal -128(,%eax,8), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addb $-64, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andb $24, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %eax, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovbl %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %ch
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %esi # imm = 0x100
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 12(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 8(%eax)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $140, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $192, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%ecx), %xmm1
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movdqu 16(%ecx), %xmm0
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm4, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm4, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[3,3,3,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm4, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm3, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm2, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm1, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(,%eax,8), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal -128(%edi), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edi, %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %bl, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $bl killed $bl killed $ebx def $ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, (%esp), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael (%esp), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb $64, %dh
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subb %bl, %dh
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %dh, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %dh, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dh
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movb %bl, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addb $-64, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dh
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %esi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ebx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovbl %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $64, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %ebx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    testl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $128, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $256, %ecx # imm = 0x100
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovael %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 28(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 24(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 16(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%eax)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $192, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $140, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%eax), %xmm0
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu 16(%eax), %xmm1
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm4, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm4, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%ecx,8), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %ebx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm4, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm3, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm2, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %cl, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %edx, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm1, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel (%esp), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $-128, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %bl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %eax, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal -128(,%eax,8), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %bl, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %eax, %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addb $-64, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $24, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebp, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovbl %eax, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %ebp, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %dl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ebx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movb $64, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subb %dl, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %al
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $64, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ecx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testl %ebx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %ecx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $128, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $256, %eax # imm = 0x100
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovael %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 12(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $140, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %init = load <32 x i8>, ptr %src, align 1
+  %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
+  %intermediate.val.frozen = freeze <64 x i8> %intermediate.sroa.0.0.vecblend
+  %intermediate.val.frozen.bits = bitcast <64 x i8> %intermediate.val.frozen to i512
+  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i512
+  %intermediate.val.frozen.bits.positioned = lshr i512 %intermediate.val.frozen.bits, %byteOff.numbits.wide
+  %intermediate.val.frozen.bits.positioned.extracted = trunc i512 %intermediate.val.frozen.bits.positioned to i256
+  store i256 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 32
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ALL: {{.*}}
+; X32: {{.*}}
+; X32-NO-SHLD: {{.*}}
+; X32-SHLD: {{.*}}
+; X64: {{.*}}
+; X64-NO-SHLD: {{.*}}
+; X64-SHLD: {{.*}}


        


More information about the llvm-commits mailing list