[llvm] 782f14a - [NFC][Codegen][X86] Improve test coverage for wide shift legalization

Roman Lebedev via llvm-commits llvm-commits at lists.llvm.org
Sat Jan 14 09:34:28 PST 2023


Author: Roman Lebedev
Date: 2023-01-14T20:34:02+03:00
New Revision: 782f14ac96a19df0fb5357d4c52269e3d72b5c42

URL: https://github.com/llvm/llvm-project/commit/782f14ac96a19df0fb5357d4c52269e3d72b5c42
DIFF: https://github.com/llvm/llvm-project/commit/782f14ac96a19df0fb5357d4c52269e3d72b5c42.diff

LOG: [NFC][Codegen][X86] Improve test coverage for wide shift legalization

Added: 
    

Modified: 
    llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
    llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index bf91fb3da203..f16342531d7c 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -1,12 +1,36 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-NO-BMI2,X64-NO-SHLD,X64-NO-BMI2-NO-SHLD
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-NO-BMI2,X64-SHLD,X64-NO-BMI2-HAVE-SHLD
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-BMI2,X64-NO-SHLD,X64-HAVE-BMI2-NO-SHLD
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-BMI2,X64-SHLD,X64-HAVE-BMI2-HAVE-SHLD
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-NO-BMI2,X32-NO-SHLD,X32-NO-BMI2-NO-SHLD
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-NO-BMI2,X32-SHLD,X32-NO-BMI2-HAVE-SHLD
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-BMI2,X32-NO-SHLD,X32-HAVE-BMI2-NO-SHLD
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-BMI2,X32-SHLD,X32-HAVE-BMI2-HAVE-SHLD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK0
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK1
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK3
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK4
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK5
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK6
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK7
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK8
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK9
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK10
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK11
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK12
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK13
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK14
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK15
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-SSE2,X32-NO-BMI2,X32-NO-SHLD-NO-BMI2,FALLBACK16
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-SSE2,X32-NO-BMI2,X32-HAVE-SHLD-NO-BMI2,FALLBACK17
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-SSE2,X32-HAVE-BMI2,X32-NO-SHLD-HAVE-BMI2,FALLBACK18
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-SSE2,X32-HAVE-BMI2,X32-HAVE-SHLD-HAVE-BMI2,FALLBACK19
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-SSE42,X32-NO-BMI2,X32-NO-SHLD-NO-BMI2,FALLBACK20
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-SSE42,X32-NO-BMI2,X32-HAVE-SHLD-NO-BMI2,FALLBACK21
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-SSE42,X32-HAVE-BMI2,X32-NO-SHLD-HAVE-BMI2,FALLBACK22
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-SSE42,X32-HAVE-BMI2,X32-HAVE-SHLD-HAVE-BMI2,FALLBACK23
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-AVX,X32-AVX1,X32-NO-BMI2,X32-NO-SHLD-NO-BMI2,FALLBACK24
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-AVX,X32-AVX1,X32-NO-BMI2,X32-HAVE-SHLD-NO-BMI2,FALLBACK25
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-AVX,X32-AVX1,X32-HAVE-BMI2,X32-NO-SHLD-HAVE-BMI2,FALLBACK26
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-AVX,X32-AVX1,X32-HAVE-BMI2,X32-HAVE-SHLD-HAVE-BMI2,FALLBACK27
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-AVX,X32-AVX512,X32-NO-BMI2,X32-NO-SHLD-NO-BMI2,FALLBACK28
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-AVX,X32-AVX512,X32-NO-BMI2,X32-HAVE-SHLD-NO-BMI2,FALLBACK29
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-AVX,X32-AVX512,X32-HAVE-BMI2,X32-NO-SHLD-HAVE-BMI2,FALLBACK30
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X32,X32-AVX,X32-AVX512,X32-HAVE-BMI2,X32-HAVE-SHLD-HAVE-BMI2,FALLBACK31
 
 define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: lshr_4bytes:
@@ -18,13 +42,13 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
 ; X64-NO-BMI2-NEXT:    retq
 ;
-; X64-BMI2-LABEL: lshr_4bytes:
-; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movzbl (%rsi), %eax
-; X64-BMI2-NEXT:    shlb $3, %al
-; X64-BMI2-NEXT:    shrxl %eax, (%rdi), %eax
-; X64-BMI2-NEXT:    movl %eax, (%rdx)
-; X64-BMI2-NEXT:    retq
+; X64-HAVE-BMI2-LABEL: lshr_4bytes:
+; X64-HAVE-BMI2:       # %bb.0:
+; X64-HAVE-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-HAVE-BMI2-NEXT:    shlb $3, %al
+; X64-HAVE-BMI2-NEXT:    shrxl %eax, (%rdi), %eax
+; X64-HAVE-BMI2-NEXT:    movl %eax, (%rdx)
+; X64-HAVE-BMI2-NEXT:    retq
 ;
 ; X32-NO-BMI2-LABEL: lshr_4bytes:
 ; X32-NO-BMI2:       # %bb.0:
@@ -38,16 +62,16 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X32-NO-BMI2-NEXT:    movl %edx, (%eax)
 ; X32-NO-BMI2-NEXT:    retl
 ;
-; X32-BMI2-LABEL: lshr_4bytes:
-; X32-BMI2:       # %bb.0:
-; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-BMI2-NEXT:    movzbl (%edx), %edx
-; X32-BMI2-NEXT:    shlb $3, %dl
-; X32-BMI2-NEXT:    shrxl %edx, (%ecx), %ecx
-; X32-BMI2-NEXT:    movl %ecx, (%eax)
-; X32-BMI2-NEXT:    retl
+; X32-HAVE-BMI2-LABEL: lshr_4bytes:
+; X32-HAVE-BMI2:       # %bb.0:
+; X32-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NEXT:    movzbl (%edx), %edx
+; X32-HAVE-BMI2-NEXT:    shlb $3, %dl
+; X32-HAVE-BMI2-NEXT:    shrxl %edx, (%ecx), %ecx
+; X32-HAVE-BMI2-NEXT:    movl %ecx, (%eax)
+; X32-HAVE-BMI2-NEXT:    retl
   %src = load i32, ptr %src.ptr, align 1
   %byteOff = load i32, ptr %byteOff.ptr, align 1
   %bitOff = shl i32 %byteOff, 3
@@ -65,13 +89,13 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
 ; X64-NO-BMI2-NEXT:    retq
 ;
-; X64-BMI2-LABEL: shl_4bytes:
-; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movzbl (%rsi), %eax
-; X64-BMI2-NEXT:    shlb $3, %al
-; X64-BMI2-NEXT:    shlxl %eax, (%rdi), %eax
-; X64-BMI2-NEXT:    movl %eax, (%rdx)
-; X64-BMI2-NEXT:    retq
+; X64-HAVE-BMI2-LABEL: shl_4bytes:
+; X64-HAVE-BMI2:       # %bb.0:
+; X64-HAVE-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-HAVE-BMI2-NEXT:    shlb $3, %al
+; X64-HAVE-BMI2-NEXT:    shlxl %eax, (%rdi), %eax
+; X64-HAVE-BMI2-NEXT:    movl %eax, (%rdx)
+; X64-HAVE-BMI2-NEXT:    retq
 ;
 ; X32-NO-BMI2-LABEL: shl_4bytes:
 ; X32-NO-BMI2:       # %bb.0:
@@ -85,16 +109,16 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X32-NO-BMI2-NEXT:    movl %edx, (%eax)
 ; X32-NO-BMI2-NEXT:    retl
 ;
-; X32-BMI2-LABEL: shl_4bytes:
-; X32-BMI2:       # %bb.0:
-; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-BMI2-NEXT:    movzbl (%edx), %edx
-; X32-BMI2-NEXT:    shlb $3, %dl
-; X32-BMI2-NEXT:    shlxl %edx, (%ecx), %ecx
-; X32-BMI2-NEXT:    movl %ecx, (%eax)
-; X32-BMI2-NEXT:    retl
+; X32-HAVE-BMI2-LABEL: shl_4bytes:
+; X32-HAVE-BMI2:       # %bb.0:
+; X32-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NEXT:    movzbl (%edx), %edx
+; X32-HAVE-BMI2-NEXT:    shlb $3, %dl
+; X32-HAVE-BMI2-NEXT:    shlxl %edx, (%ecx), %ecx
+; X32-HAVE-BMI2-NEXT:    movl %ecx, (%eax)
+; X32-HAVE-BMI2-NEXT:    retl
   %src = load i32, ptr %src.ptr, align 1
   %byteOff = load i32, ptr %byteOff.ptr, align 1
   %bitOff = shl i32 %byteOff, 3
@@ -112,13 +136,13 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
 ; X64-NO-BMI2-NEXT:    retq
 ;
-; X64-BMI2-LABEL: ashr_4bytes:
-; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movzbl (%rsi), %eax
-; X64-BMI2-NEXT:    shlb $3, %al
-; X64-BMI2-NEXT:    sarxl %eax, (%rdi), %eax
-; X64-BMI2-NEXT:    movl %eax, (%rdx)
-; X64-BMI2-NEXT:    retq
+; X64-HAVE-BMI2-LABEL: ashr_4bytes:
+; X64-HAVE-BMI2:       # %bb.0:
+; X64-HAVE-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-HAVE-BMI2-NEXT:    shlb $3, %al
+; X64-HAVE-BMI2-NEXT:    sarxl %eax, (%rdi), %eax
+; X64-HAVE-BMI2-NEXT:    movl %eax, (%rdx)
+; X64-HAVE-BMI2-NEXT:    retq
 ;
 ; X32-NO-BMI2-LABEL: ashr_4bytes:
 ; X32-NO-BMI2:       # %bb.0:
@@ -132,16 +156,16 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X32-NO-BMI2-NEXT:    movl %edx, (%eax)
 ; X32-NO-BMI2-NEXT:    retl
 ;
-; X32-BMI2-LABEL: ashr_4bytes:
-; X32-BMI2:       # %bb.0:
-; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-BMI2-NEXT:    movzbl (%edx), %edx
-; X32-BMI2-NEXT:    shlb $3, %dl
-; X32-BMI2-NEXT:    sarxl %edx, (%ecx), %ecx
-; X32-BMI2-NEXT:    movl %ecx, (%eax)
-; X32-BMI2-NEXT:    retl
+; X32-HAVE-BMI2-LABEL: ashr_4bytes:
+; X32-HAVE-BMI2:       # %bb.0:
+; X32-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NEXT:    movzbl (%edx), %edx
+; X32-HAVE-BMI2-NEXT:    shlb $3, %dl
+; X32-HAVE-BMI2-NEXT:    sarxl %edx, (%ecx), %ecx
+; X32-HAVE-BMI2-NEXT:    movl %ecx, (%eax)
+; X32-HAVE-BMI2-NEXT:    retl
   %src = load i32, ptr %src.ptr, align 1
   %byteOff = load i32, ptr %byteOff.ptr, align 1
   %bitOff = shl i32 %byteOff, 3
@@ -160,120 +184,120 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NEXT:    movq %rax, (%rdx)
 ; X64-NO-BMI2-NEXT:    retq
 ;
-; X64-BMI2-LABEL: lshr_8bytes:
-; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movzbl (%rsi), %eax
-; X64-BMI2-NEXT:    shlb $3, %al
-; X64-BMI2-NEXT:    shrxq %rax, (%rdi), %rax
-; X64-BMI2-NEXT:    movq %rax, (%rdx)
-; X64-BMI2-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: lshr_8bytes:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shlb $3, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%edx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: lshr_8bytes:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shlb $3, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: lshr_8bytes:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlb $3, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%edx), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_8bytes:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlb $3, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-HAVE-BMI2-LABEL: lshr_8bytes:
+; X64-HAVE-BMI2:       # %bb.0:
+; X64-HAVE-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-HAVE-BMI2-NEXT:    shlb $3, %al
+; X64-HAVE-BMI2-NEXT:    shrxq %rax, (%rdi), %rax
+; X64-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-NEXT:    retq
+;
+; X32-NO-SHLD-NO-BMI2-LABEL: lshr_8bytes:
+; X32-NO-SHLD-NO-BMI2:       # %bb.0:
+; X32-NO-SHLD-NO-BMI2-NEXT:    pushl %ebx
+; X32-NO-SHLD-NO-BMI2-NEXT:    pushl %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    pushl %esi
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl (%ecx), %ebx
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl 4(%ecx), %esi
+; X32-NO-SHLD-NO-BMI2-NEXT:    movzbl (%eax), %eax
+; X32-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X32-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebx
+; X32-NO-SHLD-NO-BMI2-NEXT:    leal (%esi,%esi), %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    notb %cl
+; X32-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X32-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %esi
+; X32-NO-SHLD-NO-BMI2-NEXT:    xorl %ecx, %ecx
+; X32-NO-SHLD-NO-BMI2-NEXT:    testb $32, %al
+; X32-NO-SHLD-NO-BMI2-NEXT:    cmovnel %esi, %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    cmovel %esi, %ecx
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, 4(%edx)
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl %edi, (%edx)
+; X32-NO-SHLD-NO-BMI2-NEXT:    popl %esi
+; X32-NO-SHLD-NO-BMI2-NEXT:    popl %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    popl %ebx
+; X32-NO-SHLD-NO-BMI2-NEXT:    retl
+;
+; X32-HAVE-SHLD-NO-BMI2-LABEL: lshr_8bytes:
+; X32-HAVE-SHLD-NO-BMI2:       # %bb.0:
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    pushl %edi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    pushl %esi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl (%esi), %edx
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%esi), %esi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, %edi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    shrl %cl, %edi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    xorl %esi, %esi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    testb $32, %cl
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    cmovnel %edi, %edx
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    cmovel %edi, %esi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, 4(%eax)
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, (%eax)
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    popl %esi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    popl %edi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    retl
+;
+; X32-NO-SHLD-HAVE-BMI2-LABEL: lshr_8bytes:
+; X32-NO-SHLD-HAVE-BMI2:       # %bb.0:
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    pushl %ebx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    pushl %edi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    pushl %esi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%edx), %esi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %ecx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, (%edx), %edx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %ebx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    notb %bl
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    leal (%esi,%esi), %edi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %ebx, %edi, %edi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    orl %edx, %edi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, %esi, %edx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    xorl %esi, %esi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    testb $32, %cl
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    cmovnel %edx, %edi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    cmovel %edx, %esi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl %esi, 4(%eax)
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, (%eax)
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    popl %esi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    popl %edi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    popl %ebx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    retl
+;
+; X32-HAVE-SHLD-HAVE-BMI2-LABEL: lshr_8bytes:
+; X32-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %edi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %esi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%esi), %edx
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%esi), %esi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %ecx
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    xorl %edi, %edi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $32, %cl
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovnel %esi, %edx
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovel %esi, %edi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edi, 4(%eax)
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, (%eax)
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %esi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %edi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    retl
   %src = load i64, ptr %src.ptr, align 1
   %byteOff = load i64, ptr %byteOff.ptr, align 1
   %bitOff = shl i64 %byteOff, 3
@@ -291,121 +315,121 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NEXT:    movq %rax, (%rdx)
 ; X64-NO-BMI2-NEXT:    retq
 ;
-; X64-BMI2-LABEL: shl_8bytes:
-; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movzbl (%rsi), %eax
-; X64-BMI2-NEXT:    shlb $3, %al
-; X64-BMI2-NEXT:    shlxq %rax, (%rdi), %rax
-; X64-BMI2-NEXT:    movq %rax, (%rdx)
-; X64-BMI2-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: shl_8bytes:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shlb $3, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%edx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%edx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: shl_8bytes:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shlb $3, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: shl_8bytes:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlb $3, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, 4(%edx), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edx, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: shl_8bytes:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlb $3, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-HAVE-BMI2-LABEL: shl_8bytes:
+; X64-HAVE-BMI2:       # %bb.0:
+; X64-HAVE-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-HAVE-BMI2-NEXT:    shlb $3, %al
+; X64-HAVE-BMI2-NEXT:    shlxq %rax, (%rdi), %rax
+; X64-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-NEXT:    retq
+;
+; X32-NO-SHLD-NO-BMI2-LABEL: shl_8bytes:
+; X32-NO-SHLD-NO-BMI2:       # %bb.0:
+; X32-NO-SHLD-NO-BMI2-NEXT:    pushl %ebx
+; X32-NO-SHLD-NO-BMI2-NEXT:    pushl %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    pushl %esi
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl (%ecx), %esi
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl 4(%ecx), %ebx
+; X32-NO-SHLD-NO-BMI2-NEXT:    movzbl (%eax), %eax
+; X32-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X32-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %ebx
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl %esi, %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    shrl %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    notb %cl
+; X32-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X32-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %esi
+; X32-NO-SHLD-NO-BMI2-NEXT:    xorl %ecx, %ecx
+; X32-NO-SHLD-NO-BMI2-NEXT:    testb $32, %al
+; X32-NO-SHLD-NO-BMI2-NEXT:    cmovnel %esi, %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    cmovel %esi, %ecx
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, (%edx)
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl %edi, 4(%edx)
+; X32-NO-SHLD-NO-BMI2-NEXT:    popl %esi
+; X32-NO-SHLD-NO-BMI2-NEXT:    popl %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    popl %ebx
+; X32-NO-SHLD-NO-BMI2-NEXT:    retl
+;
+; X32-HAVE-SHLD-NO-BMI2-LABEL: shl_8bytes:
+; X32-HAVE-SHLD-NO-BMI2:       # %bb.0:
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    pushl %edi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    pushl %esi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl (%edx), %esi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%edx), %edx
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, %edi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    shll %cl, %edi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    shldl %cl, %esi, %edx
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    xorl %esi, %esi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    testb $32, %cl
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    cmovnel %edi, %edx
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    cmovel %edi, %esi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, 4(%eax)
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, (%eax)
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    popl %esi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    popl %edi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    retl
+;
+; X32-NO-SHLD-HAVE-BMI2-LABEL: shl_8bytes:
+; X32-NO-SHLD-HAVE-BMI2:       # %bb.0:
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    pushl %ebx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    pushl %edi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    pushl %esi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl (%edx), %esi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %ecx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %ecx, 4(%edx), %edx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %ebx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    notb %bl
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %ecx, %esi, %edi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    shrl %esi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ebx, %esi, %esi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    orl %edx, %esi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    xorl %edx, %edx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    testb $32, %cl
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    cmovnel %edi, %esi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    cmovel %edi, %edx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, (%eax)
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl %esi, 4(%eax)
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    popl %esi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    popl %edi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    popl %ebx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    retl
+;
+; X32-HAVE-SHLD-HAVE-BMI2-LABEL: shl_8bytes:
+; X32-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %edi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %esi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%edx), %esi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%edx), %edx
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %ecx
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    shldl %cl, %esi, %edx
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    shlxl %ecx, %esi, %esi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    xorl %edi, %edi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $32, %cl
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovnel %esi, %edx
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovel %esi, %edi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, 4(%eax)
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edi, (%eax)
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %esi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %edi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    retl
   %src = load i64, ptr %src.ptr, align 1
   %byteOff = load i64, ptr %byteOff.ptr, align 1
   %bitOff = shl i64 %byteOff, 3
@@ -423,121 +447,121 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NEXT:    movq %rax, (%rdx)
 ; X64-NO-BMI2-NEXT:    retq
 ;
-; X64-BMI2-LABEL: ashr_8bytes:
-; X64-BMI2:       # %bb.0:
-; X64-BMI2-NEXT:    movzbl (%rsi), %eax
-; X64-BMI2-NEXT:    shlb $3, %al
-; X64-BMI2-NEXT:    sarxq %rax, (%rdi), %rax
-; X64-BMI2-NEXT:    movq %rax, (%rdx)
-; X64-BMI2-NEXT:    retq
-;
-; X32-NO-BMI2-NO-SHLD-LABEL: ashr_8bytes:
-; X32-NO-BMI2-NO-SHLD:       # %bb.0:
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
-; X32-NO-BMI2-NO-SHLD-NEXT:    shlb $3, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%edx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-NO-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-NO-BMI2-HAVE-SHLD-LABEL: ashr_8bytes:
-; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shlb $3, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-NO-SHLD-LABEL: ashr_8bytes:
-; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%edx), %edx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlb $3, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esi), %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %edx, %ecx, %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
-;
-; X32-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_8bytes:
-; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlb $3, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %esi, %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
-; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+; X64-HAVE-BMI2-LABEL: ashr_8bytes:
+; X64-HAVE-BMI2:       # %bb.0:
+; X64-HAVE-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-HAVE-BMI2-NEXT:    shlb $3, %al
+; X64-HAVE-BMI2-NEXT:    sarxq %rax, (%rdi), %rax
+; X64-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-NEXT:    retq
+;
+; X32-NO-SHLD-NO-BMI2-LABEL: ashr_8bytes:
+; X32-NO-SHLD-NO-BMI2:       # %bb.0:
+; X32-NO-SHLD-NO-BMI2-NEXT:    pushl %ebx
+; X32-NO-SHLD-NO-BMI2-NEXT:    pushl %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    pushl %esi
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl (%ecx), %ebx
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl 4(%ecx), %esi
+; X32-NO-SHLD-NO-BMI2-NEXT:    movzbl (%eax), %eax
+; X32-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X32-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebx
+; X32-NO-SHLD-NO-BMI2-NEXT:    leal (%esi,%esi), %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    notb %cl
+; X32-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl %esi, %ebx
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X32-NO-SHLD-NO-BMI2-NEXT:    sarl %cl, %ebx
+; X32-NO-SHLD-NO-BMI2-NEXT:    sarl $31, %esi
+; X32-NO-SHLD-NO-BMI2-NEXT:    testb $32, %al
+; X32-NO-SHLD-NO-BMI2-NEXT:    cmovnel %ebx, %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    cmovel %ebx, %esi
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl %esi, 4(%edx)
+; X32-NO-SHLD-NO-BMI2-NEXT:    movl %edi, (%edx)
+; X32-NO-SHLD-NO-BMI2-NEXT:    popl %esi
+; X32-NO-SHLD-NO-BMI2-NEXT:    popl %edi
+; X32-NO-SHLD-NO-BMI2-NEXT:    popl %ebx
+; X32-NO-SHLD-NO-BMI2-NEXT:    retl
+;
+; X32-HAVE-SHLD-NO-BMI2-LABEL: ashr_8bytes:
+; X32-HAVE-SHLD-NO-BMI2:       # %bb.0:
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    pushl %edi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    pushl %esi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl (%esi), %edx
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%esi), %esi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, %edi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    sarl %cl, %edi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    sarl $31, %esi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    testb $32, %cl
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    cmovnel %edi, %edx
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    cmovel %edi, %esi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, 4(%eax)
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, (%eax)
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    popl %esi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    popl %edi
+; X32-HAVE-SHLD-NO-BMI2-NEXT:    retl
+;
+; X32-NO-SHLD-HAVE-BMI2-LABEL: ashr_8bytes:
+; X32-NO-SHLD-HAVE-BMI2:       # %bb.0:
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    pushl %ebx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    pushl %edi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    pushl %esi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%esi), %ecx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%edx), %edx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %dl
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %edx, (%esi), %esi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, %ebx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    notb %bl
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    leal (%ecx,%ecx), %edi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %ebx, %edi, %edi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    orl %esi, %edi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    sarxl %edx, %ecx, %esi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    sarl $31, %ecx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    testb $32, %dl
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    cmovnel %esi, %edi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    cmovel %esi, %ecx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, 4(%eax)
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, (%eax)
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    popl %esi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    popl %edi
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    popl %ebx
+; X32-NO-SHLD-HAVE-BMI2-NEXT:    retl
+;
+; X32-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_8bytes:
+; X32-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %edi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %esi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%esi), %edx
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%esi), %esi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %ecx
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    sarxl %ecx, %esi, %edi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    sarl $31, %esi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $32, %cl
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovnel %edi, %edx
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovel %edi, %esi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %esi, 4(%eax)
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, (%eax)
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %esi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %edi
+; X32-HAVE-SHLD-HAVE-BMI2-NEXT:    retl
   %src = load i64, ptr %src.ptr, align 1
   %byteOff = load i64, ptr %byteOff.ptr, align 1
   %bitOff = shl i64 %byteOff, 3
@@ -547,117 +571,151 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 }
 
 define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: lshr_16bytes:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlb $3, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %rdi, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, 8(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_16bytes:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlb $3, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rsi, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_16bytes:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlb $3, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, (%rdi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rsi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rax, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_16bytes:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlb $3, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rdi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-LABEL: lshr_16bytes:
-; X32:       # %bb.0:
-; X32-NEXT:    pushl %ebx
-; X32-NEXT:    pushl %edi
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    subl $32, %esp
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movl (%edx), %esi
-; X32-NEXT:    movl 4(%edx), %edi
-; X32-NEXT:    movl 8(%edx), %ebx
-; X32-NEXT:    movl 12(%edx), %edx
-; X32-NEXT:    movzbl (%ecx), %ecx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %esi, (%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    andl $15, %ecx
-; X32-NEXT:    movl (%esp,%ecx), %edx
-; X32-NEXT:    movl 4(%esp,%ecx), %esi
-; X32-NEXT:    movl 12(%esp,%ecx), %edi
-; X32-NEXT:    movl 8(%esp,%ecx), %ecx
-; X32-NEXT:    movl %ecx, 8(%eax)
-; X32-NEXT:    movl %edi, 12(%eax)
-; X32-NEXT:    movl %edx, (%eax)
-; X32-NEXT:    movl %esi, 4(%eax)
-; X32-NEXT:    addl $32, %esp
-; X32-NEXT:    popl %esi
-; X32-NEXT:    popl %edi
-; X32-NEXT:    popl %ebx
-; X32-NEXT:    retl
+; X64-NO-SHLD-NO-BMI2-LABEL: lshr_16bytes:
+; X64-NO-SHLD-NO-BMI2:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq (%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
+; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT:    leaq (%rdi,%rdi), %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    notb %cl
+; X64-NO-SHLD-NO-BMI2-NEXT:    shlq %cl, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    orq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT:    xorl %ecx, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT:    testb $64, %al
+; X64-NO-SHLD-NO-BMI2-NEXT:    cmovneq %rdi, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    cmoveq %rdi, %rcx
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rcx, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rsi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT:    retq
+;
+; X64-HAVE-SHLD-NO-BMI2-LABEL: lshr_16bytes:
+; X64-HAVE-SHLD-NO-BMI2:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    shrq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    shrdq %cl, %rdi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    xorl %edi, %edi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    testb $64, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmovneq %rsi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmoveq %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    retq
+;
+; X64-NO-SHLD-HAVE-BMI2-LABEL: lshr_16bytes:
+; X64-NO-SHLD-HAVE-BMI2:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrxq %rcx, (%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %edi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    notb %dil
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    leaq (%rax,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlxq %rdi, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    orq %rsi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrxq %rcx, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    xorl %esi, %esi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmovneq %rax, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmoveq %rax, %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rsi, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-LABEL: lshr_16bytes:
+; X64-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdq %cl, %rdi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shrxq %rcx, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    xorl %edi, %edi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovneq %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmoveq %rsi, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
+;
+; X32-SSE2-LABEL: lshr_16bytes:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    pushl %ebx
+; X32-SSE2-NEXT:    pushl %edi
+; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    subl $32, %esp
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-SSE2-NEXT:    movl (%edx), %esi
+; X32-SSE2-NEXT:    movl 4(%edx), %edi
+; X32-SSE2-NEXT:    movl 8(%edx), %ebx
+; X32-SSE2-NEXT:    movl 12(%edx), %edx
+; X32-SSE2-NEXT:    movzbl (%ecx), %ecx
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %esi, (%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    andl $15, %ecx
+; X32-SSE2-NEXT:    movl (%esp,%ecx), %edx
+; X32-SSE2-NEXT:    movl 4(%esp,%ecx), %esi
+; X32-SSE2-NEXT:    movl 12(%esp,%ecx), %edi
+; X32-SSE2-NEXT:    movl 8(%esp,%ecx), %ecx
+; X32-SSE2-NEXT:    movl %ecx, 8(%eax)
+; X32-SSE2-NEXT:    movl %edi, 12(%eax)
+; X32-SSE2-NEXT:    movl %edx, (%eax)
+; X32-SSE2-NEXT:    movl %esi, 4(%eax)
+; X32-SSE2-NEXT:    addl $32, %esp
+; X32-SSE2-NEXT:    popl %esi
+; X32-SSE2-NEXT:    popl %edi
+; X32-SSE2-NEXT:    popl %ebx
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE42-LABEL: lshr_16bytes:
+; X32-SSE42:       # %bb.0:
+; X32-SSE42-NEXT:    subl $32, %esp
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-SSE42-NEXT:    movups (%edx), %xmm0
+; X32-SSE42-NEXT:    movzbl (%ecx), %ecx
+; X32-SSE42-NEXT:    xorps %xmm1, %xmm1
+; X32-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm0, (%esp)
+; X32-SSE42-NEXT:    andl $15, %ecx
+; X32-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
+; X32-SSE42-NEXT:    movups %xmm0, (%eax)
+; X32-SSE42-NEXT:    addl $32, %esp
+; X32-SSE42-NEXT:    retl
+;
+; X32-AVX-LABEL: lshr_16bytes:
+; X32-AVX:       # %bb.0:
+; X32-AVX-NEXT:    subl $32, %esp
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-AVX-NEXT:    vmovups (%edx), %xmm0
+; X32-AVX-NEXT:    movzbl (%ecx), %ecx
+; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT:    vmovups %xmm1, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    vmovups %xmm0, (%esp)
+; X32-AVX-NEXT:    andl $15, %ecx
+; X32-AVX-NEXT:    vmovups (%esp,%ecx), %xmm0
+; X32-AVX-NEXT:    vmovups %xmm0, (%eax)
+; X32-AVX-NEXT:    addl $32, %esp
+; X32-AVX-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
   %bitOff = shl i128 %byteOff, 3
@@ -666,120 +724,158 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
   ret void
 }
 define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: shl_16bytes:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlb $3, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, 8(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlb $3, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorl %eax, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_16bytes:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlb $3, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, 8(%rdi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %rax, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rdi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlb $3, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rax, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rax, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rax, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-LABEL: shl_16bytes:
-; X32:       # %bb.0:
-; X32-NEXT:    pushl %ebx
-; X32-NEXT:    pushl %edi
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    subl $32, %esp
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movl (%edx), %esi
-; X32-NEXT:    movl 4(%edx), %edi
-; X32-NEXT:    movl 8(%edx), %ebx
-; X32-NEXT:    movl 12(%edx), %edx
-; X32-NEXT:    movzbl (%ecx), %ecx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, (%esp)
-; X32-NEXT:    andb $15, %cl
-; X32-NEXT:    negb %cl
-; X32-NEXT:    movsbl %cl, %ecx
-; X32-NEXT:    movl 16(%esp,%ecx), %edx
-; X32-NEXT:    movl 20(%esp,%ecx), %esi
-; X32-NEXT:    movl 28(%esp,%ecx), %edi
-; X32-NEXT:    movl 24(%esp,%ecx), %ecx
-; X32-NEXT:    movl %ecx, 8(%eax)
-; X32-NEXT:    movl %edi, 12(%eax)
-; X32-NEXT:    movl %edx, (%eax)
-; X32-NEXT:    movl %esi, 4(%eax)
-; X32-NEXT:    addl $32, %esp
-; X32-NEXT:    popl %esi
-; X32-NEXT:    popl %edi
-; X32-NEXT:    popl %ebx
-; X32-NEXT:    retl
+; X64-NO-SHLD-NO-BMI2-LABEL: shl_16bytes:
+; X64-NO-SHLD-NO-BMI2:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq (%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
+; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT:    shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    notb %cl
+; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %cl, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    orq %rdi, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT:    shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT:    xorl %ecx, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT:    testb $64, %al
+; X64-NO-SHLD-NO-BMI2-NEXT:    cmovneq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    cmoveq %r8, %rcx
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rcx, (%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rsi, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT:    retq
+;
+; X64-HAVE-SHLD-NO-BMI2-LABEL: shl_16bytes:
+; X64-HAVE-SHLD-NO-BMI2:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    shlq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    shldq %cl, %rax, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    xorl %eax, %eax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    testb $64, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmovneq %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmoveq %rsi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    retq
+;
+; X64-NO-SHLD-HAVE-BMI2-LABEL: shl_16bytes:
+; X64-NO-SHLD-HAVE-BMI2:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq (%rdi), %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlxq %rcx, 8(%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %edi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    notb %dil
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlxq %rcx, %rax, %r8
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrq %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrxq %rdi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    orq %rsi, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    xorl %esi, %esi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmovneq %r8, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmoveq %r8, %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rsi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-LABEL: shl_16bytes:
+; X64-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shldq %cl, %rax, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shlxq %rcx, %rax, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    xorl %esi, %esi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovneq %rax, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmoveq %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rsi, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
+;
+; X32-SSE2-LABEL: shl_16bytes:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    pushl %ebx
+; X32-SSE2-NEXT:    pushl %edi
+; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    subl $32, %esp
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-SSE2-NEXT:    movl (%edx), %esi
+; X32-SSE2-NEXT:    movl 4(%edx), %edi
+; X32-SSE2-NEXT:    movl 8(%edx), %ebx
+; X32-SSE2-NEXT:    movl 12(%edx), %edx
+; X32-SSE2-NEXT:    movzbl (%ecx), %ecx
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, (%esp)
+; X32-SSE2-NEXT:    andb $15, %cl
+; X32-SSE2-NEXT:    negb %cl
+; X32-SSE2-NEXT:    movsbl %cl, %ecx
+; X32-SSE2-NEXT:    movl 16(%esp,%ecx), %edx
+; X32-SSE2-NEXT:    movl 20(%esp,%ecx), %esi
+; X32-SSE2-NEXT:    movl 28(%esp,%ecx), %edi
+; X32-SSE2-NEXT:    movl 24(%esp,%ecx), %ecx
+; X32-SSE2-NEXT:    movl %ecx, 8(%eax)
+; X32-SSE2-NEXT:    movl %edi, 12(%eax)
+; X32-SSE2-NEXT:    movl %edx, (%eax)
+; X32-SSE2-NEXT:    movl %esi, 4(%eax)
+; X32-SSE2-NEXT:    addl $32, %esp
+; X32-SSE2-NEXT:    popl %esi
+; X32-SSE2-NEXT:    popl %edi
+; X32-SSE2-NEXT:    popl %ebx
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE42-LABEL: shl_16bytes:
+; X32-SSE42:       # %bb.0:
+; X32-SSE42-NEXT:    subl $32, %esp
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-SSE42-NEXT:    movups (%edx), %xmm0
+; X32-SSE42-NEXT:    movzbl (%ecx), %ecx
+; X32-SSE42-NEXT:    xorps %xmm1, %xmm1
+; X32-SSE42-NEXT:    movups %xmm1, (%esp)
+; X32-SSE42-NEXT:    movups %xmm0, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    andb $15, %cl
+; X32-SSE42-NEXT:    negb %cl
+; X32-SSE42-NEXT:    movsbl %cl, %ecx
+; X32-SSE42-NEXT:    movups 16(%esp,%ecx), %xmm0
+; X32-SSE42-NEXT:    movups %xmm0, (%eax)
+; X32-SSE42-NEXT:    addl $32, %esp
+; X32-SSE42-NEXT:    retl
+;
+; X32-AVX-LABEL: shl_16bytes:
+; X32-AVX:       # %bb.0:
+; X32-AVX-NEXT:    subl $32, %esp
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-AVX-NEXT:    vmovups (%edx), %xmm0
+; X32-AVX-NEXT:    movzbl (%ecx), %ecx
+; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT:    vmovups %xmm1, (%esp)
+; X32-AVX-NEXT:    vmovups %xmm0, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    andb $15, %cl
+; X32-AVX-NEXT:    negb %cl
+; X32-AVX-NEXT:    movsbl %cl, %ecx
+; X32-AVX-NEXT:    vmovups 16(%esp,%ecx), %xmm0
+; X32-AVX-NEXT:    vmovups %xmm0, (%eax)
+; X32-AVX-NEXT:    addl $32, %esp
+; X32-AVX-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
   %bitOff = shl i128 %byteOff, 3
@@ -788,119 +884,183 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
   ret void
 }
 define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-NO-BMI2-NO-SHLD-LABEL: ashr_16bytes:
-; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlb $3, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %al
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmovneq %r8, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_16bytes:
-; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlb $3, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rsi, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_16bytes:
-; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%rsi), %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlb $3, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, (%rdi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rsi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rcx, %rax, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarq $63, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmovneq %rsi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
-;
-; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_16bytes:
-; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%rsi), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlb $3, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %rdi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovneq %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
-;
-; X32-LABEL: ashr_16bytes:
-; X32:       # %bb.0:
-; X32-NEXT:    pushl %ebx
-; X32-NEXT:    pushl %edi
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    subl $32, %esp
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movl (%edx), %esi
-; X32-NEXT:    movl 4(%edx), %edi
-; X32-NEXT:    movl 8(%edx), %ebx
-; X32-NEXT:    movl 12(%edx), %edx
-; X32-NEXT:    movzbl (%ecx), %ecx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %esi, (%esp)
-; X32-NEXT:    sarl $31, %edx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT:    andl $15, %ecx
-; X32-NEXT:    movl (%esp,%ecx), %edx
-; X32-NEXT:    movl 4(%esp,%ecx), %esi
-; X32-NEXT:    movl 12(%esp,%ecx), %edi
-; X32-NEXT:    movl 8(%esp,%ecx), %ecx
-; X32-NEXT:    movl %ecx, 8(%eax)
-; X32-NEXT:    movl %edi, 12(%eax)
-; X32-NEXT:    movl %edx, (%eax)
-; X32-NEXT:    movl %esi, 4(%eax)
-; X32-NEXT:    addl $32, %esp
-; X32-NEXT:    popl %esi
-; X32-NEXT:    popl %edi
-; X32-NEXT:    popl %ebx
-; X32-NEXT:    retl
+; X64-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes:
+; X64-NO-SHLD-NO-BMI2:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq (%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %eax
+; X64-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
+; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT:    leaq (%rdi,%rdi), %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    notb %cl
+; X64-NO-SHLD-NO-BMI2-NEXT:    shlq %cl, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    orq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rdi, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT:    sarq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT:    sarq $63, %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT:    testb $64, %al
+; X64-NO-SHLD-NO-BMI2-NEXT:    cmovneq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT:    cmoveq %r8, %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rdi, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rsi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT:    retq
+;
+; X64-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes:
+; X64-HAVE-SHLD-NO-BMI2:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    sarq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    shrdq %cl, %rdi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    testb $64, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmovneq %rsi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmoveq %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT:    retq
+;
+; X64-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
+; X64-NO-SHLD-HAVE-BMI2:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrxq %rcx, (%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %edi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    notb %dil
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    leaq (%rax,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlxq %rdi, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    orq %rsi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    sarxq %rcx, %rax, %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    sarq $63, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmovneq %rsi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmoveq %rsi, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT:    retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
+; X64-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdq %cl, %rdi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    sarxq %rcx, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovneq %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmoveq %rsi, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
+;
+; X32-SSE2-LABEL: ashr_16bytes:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    pushl %ebx
+; X32-SSE2-NEXT:    pushl %edi
+; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    subl $32, %esp
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-SSE2-NEXT:    movl (%edx), %esi
+; X32-SSE2-NEXT:    movl 4(%edx), %edi
+; X32-SSE2-NEXT:    movl 8(%edx), %ebx
+; X32-SSE2-NEXT:    movl 12(%edx), %edx
+; X32-SSE2-NEXT:    movzbl (%ecx), %ecx
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %esi, (%esp)
+; X32-SSE2-NEXT:    sarl $31, %edx
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    andl $15, %ecx
+; X32-SSE2-NEXT:    movl (%esp,%ecx), %edx
+; X32-SSE2-NEXT:    movl 4(%esp,%ecx), %esi
+; X32-SSE2-NEXT:    movl 12(%esp,%ecx), %edi
+; X32-SSE2-NEXT:    movl 8(%esp,%ecx), %ecx
+; X32-SSE2-NEXT:    movl %ecx, 8(%eax)
+; X32-SSE2-NEXT:    movl %edi, 12(%eax)
+; X32-SSE2-NEXT:    movl %edx, (%eax)
+; X32-SSE2-NEXT:    movl %esi, 4(%eax)
+; X32-SSE2-NEXT:    addl $32, %esp
+; X32-SSE2-NEXT:    popl %esi
+; X32-SSE2-NEXT:    popl %edi
+; X32-SSE2-NEXT:    popl %ebx
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE42-LABEL: ashr_16bytes:
+; X32-SSE42:       # %bb.0:
+; X32-SSE42-NEXT:    pushl %ebx
+; X32-SSE42-NEXT:    pushl %edi
+; X32-SSE42-NEXT:    pushl %esi
+; X32-SSE42-NEXT:    subl $32, %esp
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-SSE42-NEXT:    movl (%edx), %esi
+; X32-SSE42-NEXT:    movl 4(%edx), %edi
+; X32-SSE42-NEXT:    movl 8(%edx), %ebx
+; X32-SSE42-NEXT:    movl 12(%edx), %edx
+; X32-SSE42-NEXT:    movzbl (%ecx), %ecx
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %esi, (%esp)
+; X32-SSE42-NEXT:    sarl $31, %edx
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    andl $15, %ecx
+; X32-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
+; X32-SSE42-NEXT:    movups %xmm0, (%eax)
+; X32-SSE42-NEXT:    addl $32, %esp
+; X32-SSE42-NEXT:    popl %esi
+; X32-SSE42-NEXT:    popl %edi
+; X32-SSE42-NEXT:    popl %ebx
+; X32-SSE42-NEXT:    retl
+;
+; X32-AVX-LABEL: ashr_16bytes:
+; X32-AVX:       # %bb.0:
+; X32-AVX-NEXT:    pushl %ebx
+; X32-AVX-NEXT:    pushl %edi
+; X32-AVX-NEXT:    pushl %esi
+; X32-AVX-NEXT:    subl $32, %esp
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-AVX-NEXT:    movl (%edx), %esi
+; X32-AVX-NEXT:    movl 4(%edx), %edi
+; X32-AVX-NEXT:    movl 8(%edx), %ebx
+; X32-AVX-NEXT:    movl 12(%edx), %edx
+; X32-AVX-NEXT:    movzbl (%ecx), %ecx
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %esi, (%esp)
+; X32-AVX-NEXT:    sarl $31, %edx
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    andl $15, %ecx
+; X32-AVX-NEXT:    vmovups (%esp,%ecx), %xmm0
+; X32-AVX-NEXT:    vmovups %xmm0, (%eax)
+; X32-AVX-NEXT:    addl $32, %esp
+; X32-AVX-NEXT:    popl %esi
+; X32-AVX-NEXT:    popl %edi
+; X32-AVX-NEXT:    popl %ebx
+; X32-AVX-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
   %bitOff = shl i128 %byteOff, 3
@@ -910,98 +1070,172 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 }
 
 define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-LABEL: lshr_32bytes:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 16(%rdi), %r8
-; X64-NEXT:    movq 24(%rdi), %rdi
-; X64-NEXT:    movzbl (%rsi), %esi
-; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $31, %esi
-; X64-NEXT:    movq -64(%rsp,%rsi), %rax
-; X64-NEXT:    movq -56(%rsp,%rsi), %rcx
-; X64-NEXT:    movq -40(%rsp,%rsi), %rdi
-; X64-NEXT:    movq -48(%rsp,%rsi), %rsi
-; X64-NEXT:    movq %rsi, 16(%rdx)
-; X64-NEXT:    movq %rdi, 24(%rdx)
-; X64-NEXT:    movq %rax, (%rdx)
-; X64-NEXT:    movq %rcx, 8(%rdx)
-; X64-NEXT:    retq
-;
-; X32-LABEL: lshr_32bytes:
-; X32:       # %bb.0:
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    pushl %ebx
-; X32-NEXT:    pushl %edi
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    subl $72, %esp
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl (%eax), %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl 4(%eax), %ecx
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-NEXT:    movl 8(%eax), %esi
-; X32-NEXT:    movl 12(%eax), %edi
-; X32-NEXT:    movl 16(%eax), %ebx
-; X32-NEXT:    movl 20(%eax), %ebp
-; X32-NEXT:    movl 24(%eax), %edx
-; X32-NEXT:    movl 28(%eax), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movzbl (%eax), %eax
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    andl $31, %eax
-; X32-NEXT:    movl 8(%esp,%eax), %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl 12(%esp,%eax), %ecx
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-NEXT:    movl 20(%esp,%eax), %esi
-; X32-NEXT:    movl 16(%esp,%eax), %edi
-; X32-NEXT:    movl 28(%esp,%eax), %ebx
-; X32-NEXT:    movl 24(%esp,%eax), %ebp
-; X32-NEXT:    movl 36(%esp,%eax), %edx
-; X32-NEXT:    movl 32(%esp,%eax), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl %ecx, 24(%eax)
-; X32-NEXT:    movl %edx, 28(%eax)
-; X32-NEXT:    movl %ebp, 16(%eax)
-; X32-NEXT:    movl %ebx, 20(%eax)
-; X32-NEXT:    movl %edi, 8(%eax)
-; X32-NEXT:    movl %esi, 12(%eax)
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, (%eax)
-; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, 4(%eax)
-; X32-NEXT:    addl $72, %esp
-; X32-NEXT:    popl %esi
-; X32-NEXT:    popl %edi
-; X32-NEXT:    popl %ebx
-; X32-NEXT:    popl %ebp
-; X32-NEXT:    retl
+; X64-SSE2-LABEL: lshr_32bytes:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movq (%rdi), %rax
+; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
+; X64-SSE2-NEXT:    movq 16(%rdi), %r8
+; X64-SSE2-NEXT:    movq 24(%rdi), %rdi
+; X64-SSE2-NEXT:    movzbl (%rsi), %esi
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    andl $31, %esi
+; X64-SSE2-NEXT:    movq -64(%rsp,%rsi), %rax
+; X64-SSE2-NEXT:    movq -56(%rsp,%rsi), %rcx
+; X64-SSE2-NEXT:    movq -40(%rsp,%rsi), %rdi
+; X64-SSE2-NEXT:    movq -48(%rsp,%rsi), %rsi
+; X64-SSE2-NEXT:    movq %rsi, 16(%rdx)
+; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
+; X64-SSE2-NEXT:    movq %rax, (%rdx)
+; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: lshr_32bytes:
+; X64-SSE42:       # %bb.0:
+; X64-SSE42-NEXT:    movups (%rdi), %xmm0
+; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
+; X64-SSE42-NEXT:    movzbl (%rsi), %eax
+; X64-SSE42-NEXT:    xorps %xmm2, %xmm2
+; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    andl $31, %eax
+; X64-SSE42-NEXT:    movups -64(%rsp,%rax), %xmm0
+; X64-SSE42-NEXT:    movups -48(%rsp,%rax), %xmm1
+; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
+; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX-LABEL: lshr_32bytes:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
+; X64-AVX-NEXT:    movzbl (%rsi), %eax
+; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    andl $31, %eax
+; X64-AVX-NEXT:    vmovups -64(%rsp,%rax), %xmm0
+; X64-AVX-NEXT:    vmovups -48(%rsp,%rax), %xmm1
+; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
+; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX-NEXT:    vzeroupper
+; X64-AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: lshr_32bytes:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    pushl %ebp
+; X32-SSE2-NEXT:    pushl %ebx
+; X32-SSE2-NEXT:    pushl %edi
+; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    subl $72, %esp
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl (%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 4(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 8(%eax), %esi
+; X32-SSE2-NEXT:    movl 12(%eax), %edi
+; X32-SSE2-NEXT:    movl 16(%eax), %ebx
+; X32-SSE2-NEXT:    movl 20(%eax), %ebp
+; X32-SSE2-NEXT:    movl 24(%eax), %edx
+; X32-SSE2-NEXT:    movl 28(%eax), %ecx
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movzbl (%eax), %eax
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    andl $31, %eax
+; X32-SSE2-NEXT:    movl 8(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 12(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 20(%esp,%eax), %esi
+; X32-SSE2-NEXT:    movl 16(%esp,%eax), %edi
+; X32-SSE2-NEXT:    movl 28(%esp,%eax), %ebx
+; X32-SSE2-NEXT:    movl 24(%esp,%eax), %ebp
+; X32-SSE2-NEXT:    movl 36(%esp,%eax), %edx
+; X32-SSE2-NEXT:    movl 32(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl %ecx, 24(%eax)
+; X32-SSE2-NEXT:    movl %edx, 28(%eax)
+; X32-SSE2-NEXT:    movl %ebp, 16(%eax)
+; X32-SSE2-NEXT:    movl %ebx, 20(%eax)
+; X32-SSE2-NEXT:    movl %edi, 8(%eax)
+; X32-SSE2-NEXT:    movl %esi, 12(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, (%eax)
+; X32-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 4(%eax)
+; X32-SSE2-NEXT:    addl $72, %esp
+; X32-SSE2-NEXT:    popl %esi
+; X32-SSE2-NEXT:    popl %edi
+; X32-SSE2-NEXT:    popl %ebx
+; X32-SSE2-NEXT:    popl %ebp
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE42-LABEL: lshr_32bytes:
+; X32-SSE42:       # %bb.0:
+; X32-SSE42-NEXT:    subl $64, %esp
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-SSE42-NEXT:    movups (%edx), %xmm0
+; X32-SSE42-NEXT:    movups 16(%edx), %xmm1
+; X32-SSE42-NEXT:    movzbl (%ecx), %ecx
+; X32-SSE42-NEXT:    xorps %xmm2, %xmm2
+; X32-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm0, (%esp)
+; X32-SSE42-NEXT:    andl $31, %ecx
+; X32-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
+; X32-SSE42-NEXT:    movups 16(%esp,%ecx), %xmm1
+; X32-SSE42-NEXT:    movups %xmm1, 16(%eax)
+; X32-SSE42-NEXT:    movups %xmm0, (%eax)
+; X32-SSE42-NEXT:    addl $64, %esp
+; X32-SSE42-NEXT:    retl
+;
+; X32-AVX-LABEL: lshr_32bytes:
+; X32-AVX:       # %bb.0:
+; X32-AVX-NEXT:    subl $64, %esp
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-AVX-NEXT:    vmovups (%edx), %ymm0
+; X32-AVX-NEXT:    movzbl (%ecx), %ecx
+; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    vmovups %ymm0, (%esp)
+; X32-AVX-NEXT:    andl $31, %ecx
+; X32-AVX-NEXT:    vmovups (%esp,%ecx), %xmm0
+; X32-AVX-NEXT:    vmovups 16(%esp,%ecx), %xmm1
+; X32-AVX-NEXT:    vmovups %xmm1, 16(%eax)
+; X32-AVX-NEXT:    vmovups %xmm0, (%eax)
+; X32-AVX-NEXT:    addl $64, %esp
+; X32-AVX-NEXT:    vzeroupper
+; X32-AVX-NEXT:    retl
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
   %bitOff = shl i256 %byteOff, 3
@@ -1010,102 +1244,184 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
   ret void
 }
 define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-LABEL: shl_32bytes:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 16(%rdi), %r8
-; X64-NEXT:    movq 24(%rdi), %rdi
-; X64-NEXT:    movzbl (%rsi), %esi
-; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andb $31, %sil
-; X64-NEXT:    negb %sil
-; X64-NEXT:    movsbq %sil, %rax
-; X64-NEXT:    movq -32(%rsp,%rax), %rcx
-; X64-NEXT:    movq -24(%rsp,%rax), %rsi
-; X64-NEXT:    movq -8(%rsp,%rax), %rdi
-; X64-NEXT:    movq -16(%rsp,%rax), %rax
-; X64-NEXT:    movq %rax, 16(%rdx)
-; X64-NEXT:    movq %rdi, 24(%rdx)
-; X64-NEXT:    movq %rcx, (%rdx)
-; X64-NEXT:    movq %rsi, 8(%rdx)
-; X64-NEXT:    retq
-;
-; X32-LABEL: shl_32bytes:
-; X32:       # %bb.0:
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    pushl %ebx
-; X32-NEXT:    pushl %edi
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    subl $72, %esp
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movl (%edx), %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl 4(%edx), %ecx
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-NEXT:    movl 8(%edx), %edi
-; X32-NEXT:    movl 12(%edx), %ebx
-; X32-NEXT:    movl 16(%edx), %ebp
-; X32-NEXT:    movzbl (%eax), %eax
-; X32-NEXT:    movl 20(%edx), %esi
-; X32-NEXT:    movl 24(%edx), %ecx
-; X32-NEXT:    movl 28(%edx), %edx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    andb $31, %al
-; X32-NEXT:    negb %al
-; X32-NEXT:    movsbl %al, %eax
-; X32-NEXT:    movl 40(%esp,%eax), %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl 44(%esp,%eax), %ecx
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-NEXT:    movl 52(%esp,%eax), %esi
-; X32-NEXT:    movl 48(%esp,%eax), %edi
-; X32-NEXT:    movl 60(%esp,%eax), %ebx
-; X32-NEXT:    movl 56(%esp,%eax), %ebp
-; X32-NEXT:    movl 68(%esp,%eax), %edx
-; X32-NEXT:    movl 64(%esp,%eax), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl %ecx, 24(%eax)
-; X32-NEXT:    movl %edx, 28(%eax)
-; X32-NEXT:    movl %ebp, 16(%eax)
-; X32-NEXT:    movl %ebx, 20(%eax)
-; X32-NEXT:    movl %edi, 8(%eax)
-; X32-NEXT:    movl %esi, 12(%eax)
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, (%eax)
-; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, 4(%eax)
-; X32-NEXT:    addl $72, %esp
-; X32-NEXT:    popl %esi
-; X32-NEXT:    popl %edi
-; X32-NEXT:    popl %ebx
-; X32-NEXT:    popl %ebp
-; X32-NEXT:    retl
+; X64-SSE2-LABEL: shl_32bytes:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movq (%rdi), %rax
+; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
+; X64-SSE2-NEXT:    movq 16(%rdi), %r8
+; X64-SSE2-NEXT:    movq 24(%rdi), %rdi
+; X64-SSE2-NEXT:    movzbl (%rsi), %esi
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    andb $31, %sil
+; X64-SSE2-NEXT:    negb %sil
+; X64-SSE2-NEXT:    movsbq %sil, %rax
+; X64-SSE2-NEXT:    movq -32(%rsp,%rax), %rcx
+; X64-SSE2-NEXT:    movq -24(%rsp,%rax), %rsi
+; X64-SSE2-NEXT:    movq -8(%rsp,%rax), %rdi
+; X64-SSE2-NEXT:    movq -16(%rsp,%rax), %rax
+; X64-SSE2-NEXT:    movq %rax, 16(%rdx)
+; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
+; X64-SSE2-NEXT:    movq %rcx, (%rdx)
+; X64-SSE2-NEXT:    movq %rsi, 8(%rdx)
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: shl_32bytes:
+; X64-SSE42:       # %bb.0:
+; X64-SSE42-NEXT:    movups (%rdi), %xmm0
+; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
+; X64-SSE42-NEXT:    movzbl (%rsi), %eax
+; X64-SSE42-NEXT:    xorps %xmm2, %xmm2
+; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    andb $31, %al
+; X64-SSE42-NEXT:    negb %al
+; X64-SSE42-NEXT:    movsbq %al, %rax
+; X64-SSE42-NEXT:    movups -32(%rsp,%rax), %xmm0
+; X64-SSE42-NEXT:    movups -16(%rsp,%rax), %xmm1
+; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
+; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX-LABEL: shl_32bytes:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
+; X64-AVX-NEXT:    movzbl (%rsi), %eax
+; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    andb $31, %al
+; X64-AVX-NEXT:    negb %al
+; X64-AVX-NEXT:    movsbq %al, %rax
+; X64-AVX-NEXT:    vmovups -32(%rsp,%rax), %xmm0
+; X64-AVX-NEXT:    vmovups -16(%rsp,%rax), %xmm1
+; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
+; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX-NEXT:    vzeroupper
+; X64-AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: shl_32bytes:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    pushl %ebp
+; X32-SSE2-NEXT:    pushl %ebx
+; X32-SSE2-NEXT:    pushl %edi
+; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    subl $72, %esp
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-SSE2-NEXT:    movl (%edx), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 4(%edx), %ecx
+; X32-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 8(%edx), %edi
+; X32-SSE2-NEXT:    movl 12(%edx), %ebx
+; X32-SSE2-NEXT:    movl 16(%edx), %ebp
+; X32-SSE2-NEXT:    movzbl (%eax), %eax
+; X32-SSE2-NEXT:    movl 20(%edx), %esi
+; X32-SSE2-NEXT:    movl 24(%edx), %ecx
+; X32-SSE2-NEXT:    movl 28(%edx), %edx
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    andb $31, %al
+; X32-SSE2-NEXT:    negb %al
+; X32-SSE2-NEXT:    movsbl %al, %eax
+; X32-SSE2-NEXT:    movl 40(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 44(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 52(%esp,%eax), %esi
+; X32-SSE2-NEXT:    movl 48(%esp,%eax), %edi
+; X32-SSE2-NEXT:    movl 60(%esp,%eax), %ebx
+; X32-SSE2-NEXT:    movl 56(%esp,%eax), %ebp
+; X32-SSE2-NEXT:    movl 68(%esp,%eax), %edx
+; X32-SSE2-NEXT:    movl 64(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl %ecx, 24(%eax)
+; X32-SSE2-NEXT:    movl %edx, 28(%eax)
+; X32-SSE2-NEXT:    movl %ebp, 16(%eax)
+; X32-SSE2-NEXT:    movl %ebx, 20(%eax)
+; X32-SSE2-NEXT:    movl %edi, 8(%eax)
+; X32-SSE2-NEXT:    movl %esi, 12(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, (%eax)
+; X32-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 4(%eax)
+; X32-SSE2-NEXT:    addl $72, %esp
+; X32-SSE2-NEXT:    popl %esi
+; X32-SSE2-NEXT:    popl %edi
+; X32-SSE2-NEXT:    popl %ebx
+; X32-SSE2-NEXT:    popl %ebp
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE42-LABEL: shl_32bytes:
+; X32-SSE42:       # %bb.0:
+; X32-SSE42-NEXT:    subl $64, %esp
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-SSE42-NEXT:    movups (%edx), %xmm0
+; X32-SSE42-NEXT:    movups 16(%edx), %xmm1
+; X32-SSE42-NEXT:    movzbl (%ecx), %ecx
+; X32-SSE42-NEXT:    xorps %xmm2, %xmm2
+; X32-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm2, (%esp)
+; X32-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm0, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    andb $31, %cl
+; X32-SSE42-NEXT:    negb %cl
+; X32-SSE42-NEXT:    movsbl %cl, %ecx
+; X32-SSE42-NEXT:    movups 32(%esp,%ecx), %xmm0
+; X32-SSE42-NEXT:    movups 48(%esp,%ecx), %xmm1
+; X32-SSE42-NEXT:    movups %xmm1, 16(%eax)
+; X32-SSE42-NEXT:    movups %xmm0, (%eax)
+; X32-SSE42-NEXT:    addl $64, %esp
+; X32-SSE42-NEXT:    retl
+;
+; X32-AVX-LABEL: shl_32bytes:
+; X32-AVX:       # %bb.0:
+; X32-AVX-NEXT:    subl $64, %esp
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-AVX-NEXT:    vmovups (%edx), %ymm0
+; X32-AVX-NEXT:    movzbl (%ecx), %ecx
+; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT:    vmovups %ymm1, (%esp)
+; X32-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    andb $31, %cl
+; X32-AVX-NEXT:    negb %cl
+; X32-AVX-NEXT:    movsbl %cl, %ecx
+; X32-AVX-NEXT:    vmovups 32(%esp,%ecx), %xmm0
+; X32-AVX-NEXT:    vmovups 48(%esp,%ecx), %xmm1
+; X32-AVX-NEXT:    vmovups %xmm1, 16(%eax)
+; X32-AVX-NEXT:    vmovups %xmm0, (%eax)
+; X32-AVX-NEXT:    addl $64, %esp
+; X32-AVX-NEXT:    vzeroupper
+; X32-AVX-NEXT:    retl
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
   %bitOff = shl i256 %byteOff, 3
@@ -1114,100 +1430,222 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
   ret void
 }
 define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-LABEL: ashr_32bytes:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 16(%rdi), %r8
-; X64-NEXT:    movq 24(%rdi), %rdi
-; X64-NEXT:    movzbl (%rsi), %esi
-; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    sarq $63, %rdi
-; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andl $31, %esi
-; X64-NEXT:    movq -64(%rsp,%rsi), %rax
-; X64-NEXT:    movq -56(%rsp,%rsi), %rcx
-; X64-NEXT:    movq -40(%rsp,%rsi), %rdi
-; X64-NEXT:    movq -48(%rsp,%rsi), %rsi
-; X64-NEXT:    movq %rsi, 16(%rdx)
-; X64-NEXT:    movq %rdi, 24(%rdx)
-; X64-NEXT:    movq %rax, (%rdx)
-; X64-NEXT:    movq %rcx, 8(%rdx)
-; X64-NEXT:    retq
-;
-; X32-LABEL: ashr_32bytes:
-; X32:       # %bb.0:
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    pushl %ebx
-; X32-NEXT:    pushl %edi
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    subl $72, %esp
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl (%eax), %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl 4(%eax), %ecx
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-NEXT:    movl 8(%eax), %edi
-; X32-NEXT:    movl 12(%eax), %ebx
-; X32-NEXT:    movl 16(%eax), %ebp
-; X32-NEXT:    movl 20(%eax), %esi
-; X32-NEXT:    movl 24(%eax), %edx
-; X32-NEXT:    movl 28(%eax), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movzbl (%eax), %eax
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT:    sarl $31, %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT:    andl $31, %eax
-; X32-NEXT:    movl 8(%esp,%eax), %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl 12(%esp,%eax), %ecx
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-NEXT:    movl 20(%esp,%eax), %esi
-; X32-NEXT:    movl 16(%esp,%eax), %edi
-; X32-NEXT:    movl 28(%esp,%eax), %ebx
-; X32-NEXT:    movl 24(%esp,%eax), %ebp
-; X32-NEXT:    movl 36(%esp,%eax), %edx
-; X32-NEXT:    movl 32(%esp,%eax), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl %ecx, 24(%eax)
-; X32-NEXT:    movl %edx, 28(%eax)
-; X32-NEXT:    movl %ebp, 16(%eax)
-; X32-NEXT:    movl %ebx, 20(%eax)
-; X32-NEXT:    movl %edi, 8(%eax)
-; X32-NEXT:    movl %esi, 12(%eax)
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, (%eax)
-; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, 4(%eax)
-; X32-NEXT:    addl $72, %esp
-; X32-NEXT:    popl %esi
-; X32-NEXT:    popl %edi
-; X32-NEXT:    popl %ebx
-; X32-NEXT:    popl %ebp
-; X32-NEXT:    retl
+; X64-SSE2-LABEL: ashr_32bytes:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movq (%rdi), %rax
+; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
+; X64-SSE2-NEXT:    movq 16(%rdi), %r8
+; X64-SSE2-NEXT:    movq 24(%rdi), %rdi
+; X64-SSE2-NEXT:    movzbl (%rsi), %esi
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    sarq $63, %rdi
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    andl $31, %esi
+; X64-SSE2-NEXT:    movq -64(%rsp,%rsi), %rax
+; X64-SSE2-NEXT:    movq -56(%rsp,%rsi), %rcx
+; X64-SSE2-NEXT:    movq -40(%rsp,%rsi), %rdi
+; X64-SSE2-NEXT:    movq -48(%rsp,%rsi), %rsi
+; X64-SSE2-NEXT:    movq %rsi, 16(%rdx)
+; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
+; X64-SSE2-NEXT:    movq %rax, (%rdx)
+; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: ashr_32bytes:
+; X64-SSE42:       # %bb.0:
+; X64-SSE42-NEXT:    movups (%rdi), %xmm0
+; X64-SSE42-NEXT:    movq 16(%rdi), %rax
+; X64-SSE42-NEXT:    movq 24(%rdi), %rcx
+; X64-SSE42-NEXT:    movzbl (%rsi), %esi
+; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    sarq $63, %rcx
+; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    andl $31, %esi
+; X64-SSE42-NEXT:    movups -64(%rsp,%rsi), %xmm0
+; X64-SSE42-NEXT:    movups -48(%rsp,%rsi), %xmm1
+; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
+; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX-LABEL: ashr_32bytes:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovups (%rdi), %xmm0
+; X64-AVX-NEXT:    movq 16(%rdi), %rax
+; X64-AVX-NEXT:    movq 24(%rdi), %rcx
+; X64-AVX-NEXT:    movzbl (%rsi), %esi
+; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    sarq $63, %rcx
+; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    andl $31, %esi
+; X64-AVX-NEXT:    vmovups -64(%rsp,%rsi), %xmm0
+; X64-AVX-NEXT:    vmovups -48(%rsp,%rsi), %xmm1
+; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
+; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: ashr_32bytes:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    pushl %ebp
+; X32-SSE2-NEXT:    pushl %ebx
+; X32-SSE2-NEXT:    pushl %edi
+; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    subl $72, %esp
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl (%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 4(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 8(%eax), %edi
+; X32-SSE2-NEXT:    movl 12(%eax), %ebx
+; X32-SSE2-NEXT:    movl 16(%eax), %ebp
+; X32-SSE2-NEXT:    movl 20(%eax), %esi
+; X32-SSE2-NEXT:    movl 24(%eax), %edx
+; X32-SSE2-NEXT:    movl 28(%eax), %ecx
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movzbl (%eax), %eax
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    andl $31, %eax
+; X32-SSE2-NEXT:    movl 8(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 12(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 20(%esp,%eax), %esi
+; X32-SSE2-NEXT:    movl 16(%esp,%eax), %edi
+; X32-SSE2-NEXT:    movl 28(%esp,%eax), %ebx
+; X32-SSE2-NEXT:    movl 24(%esp,%eax), %ebp
+; X32-SSE2-NEXT:    movl 36(%esp,%eax), %edx
+; X32-SSE2-NEXT:    movl 32(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl %ecx, 24(%eax)
+; X32-SSE2-NEXT:    movl %edx, 28(%eax)
+; X32-SSE2-NEXT:    movl %ebp, 16(%eax)
+; X32-SSE2-NEXT:    movl %ebx, 20(%eax)
+; X32-SSE2-NEXT:    movl %edi, 8(%eax)
+; X32-SSE2-NEXT:    movl %esi, 12(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, (%eax)
+; X32-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 4(%eax)
+; X32-SSE2-NEXT:    addl $72, %esp
+; X32-SSE2-NEXT:    popl %esi
+; X32-SSE2-NEXT:    popl %edi
+; X32-SSE2-NEXT:    popl %ebx
+; X32-SSE2-NEXT:    popl %ebp
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE42-LABEL: ashr_32bytes:
+; X32-SSE42:       # %bb.0:
+; X32-SSE42-NEXT:    pushl %ebx
+; X32-SSE42-NEXT:    pushl %edi
+; X32-SSE42-NEXT:    pushl %esi
+; X32-SSE42-NEXT:    subl $64, %esp
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-SSE42-NEXT:    movups (%edx), %xmm0
+; X32-SSE42-NEXT:    movl 16(%edx), %esi
+; X32-SSE42-NEXT:    movl 20(%edx), %edi
+; X32-SSE42-NEXT:    movl 24(%edx), %ebx
+; X32-SSE42-NEXT:    movl 28(%edx), %edx
+; X32-SSE42-NEXT:    movzbl (%ecx), %ecx
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm0, (%esp)
+; X32-SSE42-NEXT:    sarl $31, %edx
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    andl $31, %ecx
+; X32-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
+; X32-SSE42-NEXT:    movups 16(%esp,%ecx), %xmm1
+; X32-SSE42-NEXT:    movups %xmm1, 16(%eax)
+; X32-SSE42-NEXT:    movups %xmm0, (%eax)
+; X32-SSE42-NEXT:    addl $64, %esp
+; X32-SSE42-NEXT:    popl %esi
+; X32-SSE42-NEXT:    popl %edi
+; X32-SSE42-NEXT:    popl %ebx
+; X32-SSE42-NEXT:    retl
+;
+; X32-AVX-LABEL: ashr_32bytes:
+; X32-AVX:       # %bb.0:
+; X32-AVX-NEXT:    pushl %ebx
+; X32-AVX-NEXT:    pushl %edi
+; X32-AVX-NEXT:    pushl %esi
+; X32-AVX-NEXT:    subl $64, %esp
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-AVX-NEXT:    vmovups (%edx), %xmm0
+; X32-AVX-NEXT:    movl 16(%edx), %esi
+; X32-AVX-NEXT:    movl 20(%edx), %edi
+; X32-AVX-NEXT:    movl 24(%edx), %ebx
+; X32-AVX-NEXT:    movl 28(%edx), %edx
+; X32-AVX-NEXT:    movzbl (%ecx), %ecx
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    vmovups %xmm0, (%esp)
+; X32-AVX-NEXT:    sarl $31, %edx
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    andl $31, %ecx
+; X32-AVX-NEXT:    vmovups (%esp,%ecx), %xmm0
+; X32-AVX-NEXT:    vmovups 16(%esp,%ecx), %xmm1
+; X32-AVX-NEXT:    vmovups %xmm1, 16(%eax)
+; X32-AVX-NEXT:    vmovups %xmm0, (%eax)
+; X32-AVX-NEXT:    addl $64, %esp
+; X32-AVX-NEXT:    popl %esi
+; X32-AVX-NEXT:    popl %edi
+; X32-AVX-NEXT:    popl %ebx
+; X32-AVX-NEXT:    retl
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
   %bitOff = shl i256 %byteOff, 3
@@ -1215,9 +1653,1124 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
   store i256 %res, ptr %dst, align 1
   ret void
 }
+
+define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: lshr_64bytes:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rbx
+; X64-SSE2-NEXT:    movq (%rdi), %rax
+; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
+; X64-SSE2-NEXT:    movq 16(%rdi), %r8
+; X64-SSE2-NEXT:    movq 24(%rdi), %r9
+; X64-SSE2-NEXT:    movq 32(%rdi), %r10
+; X64-SSE2-NEXT:    movq 40(%rdi), %r11
+; X64-SSE2-NEXT:    movq 48(%rdi), %rbx
+; X64-SSE2-NEXT:    movq 56(%rdi), %rdi
+; X64-SSE2-NEXT:    movl (%rsi), %esi
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    andl $63, %esi
+; X64-SSE2-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-SSE2-NEXT:    movq -120(%rsp,%rsi), %rcx
+; X64-SSE2-NEXT:    movq -104(%rsp,%rsi), %rdi
+; X64-SSE2-NEXT:    movq -112(%rsp,%rsi), %r8
+; X64-SSE2-NEXT:    movq -88(%rsp,%rsi), %r9
+; X64-SSE2-NEXT:    movq -96(%rsp,%rsi), %r10
+; X64-SSE2-NEXT:    movq -72(%rsp,%rsi), %r11
+; X64-SSE2-NEXT:    movq -80(%rsp,%rsi), %rsi
+; X64-SSE2-NEXT:    movq %rsi, 48(%rdx)
+; X64-SSE2-NEXT:    movq %r11, 56(%rdx)
+; X64-SSE2-NEXT:    movq %r10, 32(%rdx)
+; X64-SSE2-NEXT:    movq %r9, 40(%rdx)
+; X64-SSE2-NEXT:    movq %r8, 16(%rdx)
+; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
+; X64-SSE2-NEXT:    movq %rax, (%rdx)
+; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
+; X64-SSE2-NEXT:    popq %rbx
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: lshr_64bytes:
+; X64-SSE42:       # %bb.0:
+; X64-SSE42-NEXT:    movups (%rdi), %xmm0
+; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
+; X64-SSE42-NEXT:    movups 32(%rdi), %xmm2
+; X64-SSE42-NEXT:    movups 48(%rdi), %xmm3
+; X64-SSE42-NEXT:    movl (%rsi), %eax
+; X64-SSE42-NEXT:    xorps %xmm4, %xmm4
+; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm3, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    andl $63, %eax
+; X64-SSE42-NEXT:    movups -128(%rsp,%rax), %xmm0
+; X64-SSE42-NEXT:    movups -112(%rsp,%rax), %xmm1
+; X64-SSE42-NEXT:    movups -96(%rsp,%rax), %xmm2
+; X64-SSE42-NEXT:    movups -80(%rsp,%rax), %xmm3
+; X64-SSE42-NEXT:    movups %xmm3, 48(%rdx)
+; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
+; X64-SSE42-NEXT:    movups %xmm2, 32(%rdx)
+; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: lshr_64bytes:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
+; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
+; X64-AVX1-NEXT:    movl (%rsi), %eax
+; X64-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    andl $63, %eax
+; X64-AVX1-NEXT:    vmovups -128(%rsp,%rax), %xmm0
+; X64-AVX1-NEXT:    vmovups -112(%rsp,%rax), %xmm1
+; X64-AVX1-NEXT:    vmovups -96(%rsp,%rax), %xmm2
+; X64-AVX1-NEXT:    vmovups -80(%rsp,%rax), %xmm3
+; X64-AVX1-NEXT:    vmovups %xmm3, 48(%rdx)
+; X64-AVX1-NEXT:    vmovups %xmm1, 16(%rdx)
+; X64-AVX1-NEXT:    vmovups %xmm2, 32(%rdx)
+; X64-AVX1-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX512-LABEL: lshr_64bytes:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovups (%rdi), %zmm0
+; X64-AVX512-NEXT:    movl (%rsi), %eax
+; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX512-NEXT:    andl $63, %eax
+; X64-AVX512-NEXT:    vmovups -128(%rsp,%rax), %xmm0
+; X64-AVX512-NEXT:    vmovups -112(%rsp,%rax), %xmm1
+; X64-AVX512-NEXT:    vmovups -96(%rsp,%rax), %xmm2
+; X64-AVX512-NEXT:    vmovups -80(%rsp,%rax), %xmm3
+; X64-AVX512-NEXT:    vmovups %xmm3, 48(%rdx)
+; X64-AVX512-NEXT:    vmovups %xmm1, 16(%rdx)
+; X64-AVX512-NEXT:    vmovups %xmm2, 32(%rdx)
+; X64-AVX512-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: lshr_64bytes:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    pushl %ebp
+; X32-SSE2-NEXT:    pushl %ebx
+; X32-SSE2-NEXT:    pushl %edi
+; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    subl $168, %esp
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl (%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 4(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 8(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 12(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 16(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 20(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 24(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 28(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 32(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 36(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 40(%eax), %ebp
+; X32-SSE2-NEXT:    movl 44(%eax), %ebx
+; X32-SSE2-NEXT:    movl 48(%eax), %edi
+; X32-SSE2-NEXT:    movl 52(%eax), %esi
+; X32-SSE2-NEXT:    movl 56(%eax), %edx
+; X32-SSE2-NEXT:    movl 60(%eax), %ecx
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl (%eax), %eax
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    andl $63, %eax
+; X32-SSE2-NEXT:    movl 40(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 44(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 52(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 48(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 60(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 56(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 68(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 64(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 76(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 72(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 84(%esp,%eax), %ebp
+; X32-SSE2-NEXT:    movl 80(%esp,%eax), %ebx
+; X32-SSE2-NEXT:    movl 92(%esp,%eax), %edi
+; X32-SSE2-NEXT:    movl 88(%esp,%eax), %esi
+; X32-SSE2-NEXT:    movl 100(%esp,%eax), %edx
+; X32-SSE2-NEXT:    movl 96(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl %ecx, 56(%eax)
+; X32-SSE2-NEXT:    movl %edx, 60(%eax)
+; X32-SSE2-NEXT:    movl %esi, 48(%eax)
+; X32-SSE2-NEXT:    movl %edi, 52(%eax)
+; X32-SSE2-NEXT:    movl %ebx, 40(%eax)
+; X32-SSE2-NEXT:    movl %ebp, 44(%eax)
+; X32-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 32(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 36(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 24(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 28(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 16(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 20(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 8(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 12(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, (%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 4(%eax)
+; X32-SSE2-NEXT:    addl $168, %esp
+; X32-SSE2-NEXT:    popl %esi
+; X32-SSE2-NEXT:    popl %edi
+; X32-SSE2-NEXT:    popl %ebx
+; X32-SSE2-NEXT:    popl %ebp
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE42-LABEL: lshr_64bytes:
+; X32-SSE42:       # %bb.0:
+; X32-SSE42-NEXT:    subl $128, %esp
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-SSE42-NEXT:    movups (%edx), %xmm0
+; X32-SSE42-NEXT:    movups 16(%edx), %xmm1
+; X32-SSE42-NEXT:    movups 32(%edx), %xmm2
+; X32-SSE42-NEXT:    movups 48(%edx), %xmm3
+; X32-SSE42-NEXT:    movl (%ecx), %ecx
+; X32-SSE42-NEXT:    xorps %xmm4, %xmm4
+; X32-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm3, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm0, (%esp)
+; X32-SSE42-NEXT:    andl $63, %ecx
+; X32-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
+; X32-SSE42-NEXT:    movups 16(%esp,%ecx), %xmm1
+; X32-SSE42-NEXT:    movups 32(%esp,%ecx), %xmm2
+; X32-SSE42-NEXT:    movups 48(%esp,%ecx), %xmm3
+; X32-SSE42-NEXT:    movups %xmm3, 48(%eax)
+; X32-SSE42-NEXT:    movups %xmm2, 32(%eax)
+; X32-SSE42-NEXT:    movups %xmm1, 16(%eax)
+; X32-SSE42-NEXT:    movups %xmm0, (%eax)
+; X32-SSE42-NEXT:    addl $128, %esp
+; X32-SSE42-NEXT:    retl
+;
+; X32-AVX1-LABEL: lshr_64bytes:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    subl $128, %esp
+; X32-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-AVX1-NEXT:    vmovups (%edx), %ymm0
+; X32-AVX1-NEXT:    vmovups 32(%edx), %ymm1
+; X32-AVX1-NEXT:    movl (%ecx), %ecx
+; X32-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X32-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; X32-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; X32-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; X32-AVX1-NEXT:    vmovups %ymm0, (%esp)
+; X32-AVX1-NEXT:    andl $63, %ecx
+; X32-AVX1-NEXT:    vmovups (%esp,%ecx), %xmm0
+; X32-AVX1-NEXT:    vmovups 16(%esp,%ecx), %xmm1
+; X32-AVX1-NEXT:    vmovups 32(%esp,%ecx), %xmm2
+; X32-AVX1-NEXT:    vmovups 48(%esp,%ecx), %xmm3
+; X32-AVX1-NEXT:    vmovups %xmm3, 48(%eax)
+; X32-AVX1-NEXT:    vmovups %xmm2, 32(%eax)
+; X32-AVX1-NEXT:    vmovups %xmm1, 16(%eax)
+; X32-AVX1-NEXT:    vmovups %xmm0, (%eax)
+; X32-AVX1-NEXT:    addl $128, %esp
+; X32-AVX1-NEXT:    vzeroupper
+; X32-AVX1-NEXT:    retl
+;
+; X32-AVX512-LABEL: lshr_64bytes:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    subl $128, %esp
+; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-AVX512-NEXT:    vmovups (%edx), %zmm0
+; X32-AVX512-NEXT:    movl (%ecx), %ecx
+; X32-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
+; X32-AVX512-NEXT:    vmovups %zmm0, (%esp)
+; X32-AVX512-NEXT:    andl $63, %ecx
+; X32-AVX512-NEXT:    vmovups (%esp,%ecx), %xmm0
+; X32-AVX512-NEXT:    vmovups 16(%esp,%ecx), %xmm1
+; X32-AVX512-NEXT:    vmovups 32(%esp,%ecx), %xmm2
+; X32-AVX512-NEXT:    vmovups 48(%esp,%ecx), %xmm3
+; X32-AVX512-NEXT:    vmovups %xmm3, 48(%eax)
+; X32-AVX512-NEXT:    vmovups %xmm2, 32(%eax)
+; X32-AVX512-NEXT:    vmovups %xmm1, 16(%eax)
+; X32-AVX512-NEXT:    vmovups %xmm0, (%eax)
+; X32-AVX512-NEXT:    addl $128, %esp
+; X32-AVX512-NEXT:    vzeroupper
+; X32-AVX512-NEXT:    retl
+  %src = load i512, ptr %src.ptr, align 1
+  %byteOff = load i512, ptr %byteOff.ptr, align 1
+  %bitOff = shl i512 %byteOff, 3
+  %res = lshr i512 %src, %bitOff
+  store i512 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; 64-byte (i512) left shift by a byte-granular amount, with both operands and the result passed through memory
+; X64-SSE2-LABEL: shl_64bytes:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rbx
+; X64-SSE2-NEXT:    movq (%rdi), %rax
+; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
+; X64-SSE2-NEXT:    movq 16(%rdi), %r8
+; X64-SSE2-NEXT:    movq 24(%rdi), %r9
+; X64-SSE2-NEXT:    movq 32(%rdi), %r10
+; X64-SSE2-NEXT:    movq 40(%rdi), %r11
+; X64-SSE2-NEXT:    movq 48(%rdi), %rbx
+; X64-SSE2-NEXT:    movq 56(%rdi), %rdi
+; X64-SSE2-NEXT:    movl (%rsi), %esi
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    andl $63, %esi
+; X64-SSE2-NEXT:    negl %esi
+; X64-SSE2-NEXT:    movslq %esi, %rax
+; X64-SSE2-NEXT:    movq -64(%rsp,%rax), %rcx
+; X64-SSE2-NEXT:    movq -56(%rsp,%rax), %rsi
+; X64-SSE2-NEXT:    movq -40(%rsp,%rax), %rdi
+; X64-SSE2-NEXT:    movq -48(%rsp,%rax), %r8
+; X64-SSE2-NEXT:    movq -24(%rsp,%rax), %r9
+; X64-SSE2-NEXT:    movq -32(%rsp,%rax), %r10
+; X64-SSE2-NEXT:    movq -8(%rsp,%rax), %r11
+; X64-SSE2-NEXT:    movq -16(%rsp,%rax), %rax
+; X64-SSE2-NEXT:    movq %rax, 48(%rdx)
+; X64-SSE2-NEXT:    movq %r11, 56(%rdx)
+; X64-SSE2-NEXT:    movq %r10, 32(%rdx)
+; X64-SSE2-NEXT:    movq %r9, 40(%rdx)
+; X64-SSE2-NEXT:    movq %r8, 16(%rdx)
+; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
+; X64-SSE2-NEXT:    movq %rcx, (%rdx)
+; X64-SSE2-NEXT:    movq %rsi, 8(%rdx)
+; X64-SSE2-NEXT:    popq %rbx
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: shl_64bytes:
+; X64-SSE42:       # %bb.0:
+; X64-SSE42-NEXT:    movups (%rdi), %xmm0
+; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
+; X64-SSE42-NEXT:    movups 32(%rdi), %xmm2
+; X64-SSE42-NEXT:    movups 48(%rdi), %xmm3
+; X64-SSE42-NEXT:    movl (%rsi), %eax
+; X64-SSE42-NEXT:    xorps %xmm4, %xmm4
+; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm3, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    andl $63, %eax
+; X64-SSE42-NEXT:    negl %eax
+; X64-SSE42-NEXT:    cltq
+; X64-SSE42-NEXT:    movups -64(%rsp,%rax), %xmm0
+; X64-SSE42-NEXT:    movups -48(%rsp,%rax), %xmm1
+; X64-SSE42-NEXT:    movups -32(%rsp,%rax), %xmm2
+; X64-SSE42-NEXT:    movups -16(%rsp,%rax), %xmm3
+; X64-SSE42-NEXT:    movups %xmm3, 48(%rdx)
+; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
+; X64-SSE42-NEXT:    movups %xmm2, 32(%rdx)
+; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: shl_64bytes:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
+; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
+; X64-AVX1-NEXT:    movl (%rsi), %eax
+; X64-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    andl $63, %eax
+; X64-AVX1-NEXT:    negl %eax
+; X64-AVX1-NEXT:    cltq
+; X64-AVX1-NEXT:    vmovups -64(%rsp,%rax), %xmm0
+; X64-AVX1-NEXT:    vmovups -48(%rsp,%rax), %xmm1
+; X64-AVX1-NEXT:    vmovups -32(%rsp,%rax), %xmm2
+; X64-AVX1-NEXT:    vmovups -16(%rsp,%rax), %xmm3
+; X64-AVX1-NEXT:    vmovups %xmm3, 48(%rdx)
+; X64-AVX1-NEXT:    vmovups %xmm1, 16(%rdx)
+; X64-AVX1-NEXT:    vmovups %xmm2, 32(%rdx)
+; X64-AVX1-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX512-LABEL: shl_64bytes:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovups (%rdi), %zmm0
+; X64-AVX512-NEXT:    movl (%rsi), %eax
+; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX512-NEXT:    andl $63, %eax
+; X64-AVX512-NEXT:    negl %eax
+; X64-AVX512-NEXT:    cltq
+; X64-AVX512-NEXT:    vmovups -64(%rsp,%rax), %xmm0
+; X64-AVX512-NEXT:    vmovups -48(%rsp,%rax), %xmm1
+; X64-AVX512-NEXT:    vmovups -32(%rsp,%rax), %xmm2
+; X64-AVX512-NEXT:    vmovups -16(%rsp,%rax), %xmm3
+; X64-AVX512-NEXT:    vmovups %xmm3, 48(%rdx)
+; X64-AVX512-NEXT:    vmovups %xmm1, 16(%rdx)
+; X64-AVX512-NEXT:    vmovups %xmm2, 32(%rdx)
+; X64-AVX512-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+;
+; X32-SSE2-LABEL: shl_64bytes:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    pushl %ebp
+; X32-SSE2-NEXT:    pushl %ebx
+; X32-SSE2-NEXT:    pushl %edi
+; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    subl $168, %esp
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl (%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 4(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 8(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 12(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 16(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 20(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 24(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 28(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 32(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 36(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 40(%eax), %ebp
+; X32-SSE2-NEXT:    movl 44(%eax), %ebx
+; X32-SSE2-NEXT:    movl 48(%eax), %edi
+; X32-SSE2-NEXT:    movl 52(%eax), %esi
+; X32-SSE2-NEXT:    movl 56(%eax), %edx
+; X32-SSE2-NEXT:    movl 60(%eax), %ecx
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl (%eax), %eax
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    andl $63, %eax
+; X32-SSE2-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X32-SSE2-NEXT:    subl %eax, %ecx
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl (%ecx), %edx
+; X32-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 4(%ecx), %edx
+; X32-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 12(%ecx), %edx
+; X32-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 8(%ecx), %edx
+; X32-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 20(%ecx), %edx
+; X32-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 16(%ecx), %edx
+; X32-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 28(%ecx), %edx
+; X32-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 24(%ecx), %edx
+; X32-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 36(%ecx), %edx
+; X32-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 32(%ecx), %edx
+; X32-SSE2-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 44(%ecx), %ebp
+; X32-SSE2-NEXT:    movl 40(%ecx), %ebx
+; X32-SSE2-NEXT:    movl 52(%ecx), %edi
+; X32-SSE2-NEXT:    movl 60(%ecx), %esi
+; X32-SSE2-NEXT:    movl 56(%ecx), %edx
+; X32-SSE2-NEXT:    negl %eax
+; X32-SSE2-NEXT:    movl 152(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl %edx, 56(%eax)
+; X32-SSE2-NEXT:    movl %esi, 60(%eax)
+; X32-SSE2-NEXT:    movl %ecx, 48(%eax)
+; X32-SSE2-NEXT:    movl %edi, 52(%eax)
+; X32-SSE2-NEXT:    movl %ebx, 40(%eax)
+; X32-SSE2-NEXT:    movl %ebp, 44(%eax)
+; X32-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 32(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 36(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 24(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 28(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 16(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 20(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 8(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 12(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, (%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 4(%eax)
+; X32-SSE2-NEXT:    addl $168, %esp
+; X32-SSE2-NEXT:    popl %esi
+; X32-SSE2-NEXT:    popl %edi
+; X32-SSE2-NEXT:    popl %ebx
+; X32-SSE2-NEXT:    popl %ebp
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE42-LABEL: shl_64bytes:
+; X32-SSE42:       # %bb.0:
+; X32-SSE42-NEXT:    subl $128, %esp
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-SSE42-NEXT:    movups (%edx), %xmm0
+; X32-SSE42-NEXT:    movups 16(%edx), %xmm1
+; X32-SSE42-NEXT:    movups 32(%edx), %xmm2
+; X32-SSE42-NEXT:    movups 48(%edx), %xmm3
+; X32-SSE42-NEXT:    movl (%ecx), %ecx
+; X32-SSE42-NEXT:    xorps %xmm4, %xmm4
+; X32-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm4, (%esp)
+; X32-SSE42-NEXT:    movups %xmm3, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm0, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    andl $63, %ecx
+; X32-SSE42-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X32-SSE42-NEXT:    subl %ecx, %edx
+; X32-SSE42-NEXT:    movups (%edx), %xmm0
+; X32-SSE42-NEXT:    movups 16(%edx), %xmm1
+; X32-SSE42-NEXT:    movups 32(%edx), %xmm2
+; X32-SSE42-NEXT:    negl %ecx
+; X32-SSE42-NEXT:    movups 112(%esp,%ecx), %xmm3
+; X32-SSE42-NEXT:    movups %xmm3, 48(%eax)
+; X32-SSE42-NEXT:    movups %xmm2, 32(%eax)
+; X32-SSE42-NEXT:    movups %xmm1, 16(%eax)
+; X32-SSE42-NEXT:    movups %xmm0, (%eax)
+; X32-SSE42-NEXT:    addl $128, %esp
+; X32-SSE42-NEXT:    retl
+;
+; X32-AVX1-LABEL: shl_64bytes:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    subl $128, %esp
+; X32-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-AVX1-NEXT:    vmovups (%edx), %ymm0
+; X32-AVX1-NEXT:    vmovups 32(%edx), %ymm1
+; X32-AVX1-NEXT:    movl (%ecx), %ecx
+; X32-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X32-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; X32-AVX1-NEXT:    vmovups %ymm2, (%esp)
+; X32-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; X32-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X32-AVX1-NEXT:    andl $63, %ecx
+; X32-AVX1-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X32-AVX1-NEXT:    subl %ecx, %edx
+; X32-AVX1-NEXT:    vmovups (%edx), %xmm0
+; X32-AVX1-NEXT:    vmovups 16(%edx), %xmm1
+; X32-AVX1-NEXT:    vmovups 32(%edx), %xmm2
+; X32-AVX1-NEXT:    negl %ecx
+; X32-AVX1-NEXT:    vmovups 112(%esp,%ecx), %xmm3
+; X32-AVX1-NEXT:    vmovups %xmm3, 48(%eax)
+; X32-AVX1-NEXT:    vmovups %xmm2, 32(%eax)
+; X32-AVX1-NEXT:    vmovups %xmm1, 16(%eax)
+; X32-AVX1-NEXT:    vmovups %xmm0, (%eax)
+; X32-AVX1-NEXT:    addl $128, %esp
+; X32-AVX1-NEXT:    vzeroupper
+; X32-AVX1-NEXT:    retl
+;
+; X32-AVX512-LABEL: shl_64bytes:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    subl $128, %esp
+; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-AVX512-NEXT:    vmovups (%edx), %zmm0
+; X32-AVX512-NEXT:    movl (%ecx), %ecx
+; X32-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX512-NEXT:    vmovups %zmm1, (%esp)
+; X32-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
+; X32-AVX512-NEXT:    andl $63, %ecx
+; X32-AVX512-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X32-AVX512-NEXT:    subl %ecx, %edx
+; X32-AVX512-NEXT:    vmovups (%edx), %xmm0
+; X32-AVX512-NEXT:    vmovups 16(%edx), %xmm1
+; X32-AVX512-NEXT:    vmovups 32(%edx), %xmm2
+; X32-AVX512-NEXT:    negl %ecx
+; X32-AVX512-NEXT:    vmovups 112(%esp,%ecx), %xmm3
+; X32-AVX512-NEXT:    vmovups %xmm3, 48(%eax)
+; X32-AVX512-NEXT:    vmovups %xmm2, 32(%eax)
+; X32-AVX512-NEXT:    vmovups %xmm1, 16(%eax)
+; X32-AVX512-NEXT:    vmovups %xmm0, (%eax)
+; X32-AVX512-NEXT:    addl $128, %esp
+; X32-AVX512-NEXT:    vzeroupper
+; X32-AVX512-NEXT:    retl
+  %src = load i512, ptr %src.ptr, align 1 ; the 512-bit value to be shifted, loaded with only byte alignment
+  %byteOff = load i512, ptr %byteOff.ptr, align 1 ; shift amount expressed in bytes, itself loaded as an i512
+  %bitOff = shl i512 %byteOff, 3 ; bytes * 8 = bits, so the shift amount is always a whole number of bytes
+  %res = shl i512 %src, %bitOff ; the wide shift under test; the checks above show the byte offset masked with 63 and used to index a zero-padded stack copy of %src
+  store i512 %res, ptr %dst, align 1 ; write the shifted value back with only byte alignment
+  ret void
+}
+define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: ashr_64bytes:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %r14
+; X64-SSE2-NEXT:    pushq %rbx
+; X64-SSE2-NEXT:    movq (%rdi), %rcx
+; X64-SSE2-NEXT:    movq 8(%rdi), %r8
+; X64-SSE2-NEXT:    movq 16(%rdi), %r9
+; X64-SSE2-NEXT:    movq 24(%rdi), %r10
+; X64-SSE2-NEXT:    movq 32(%rdi), %r11
+; X64-SSE2-NEXT:    movq 40(%rdi), %rbx
+; X64-SSE2-NEXT:    movq 48(%rdi), %r14
+; X64-SSE2-NEXT:    movq 56(%rdi), %rdi
+; X64-SSE2-NEXT:    movl (%rsi), %eax
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    sarq $63, %rdi
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT:    andl $63, %eax
+; X64-SSE2-NEXT:    movq -128(%rsp,%rax), %rcx
+; X64-SSE2-NEXT:    movq -120(%rsp,%rax), %rsi
+; X64-SSE2-NEXT:    movq -104(%rsp,%rax), %rdi
+; X64-SSE2-NEXT:    movq -112(%rsp,%rax), %r8
+; X64-SSE2-NEXT:    movq -88(%rsp,%rax), %r9
+; X64-SSE2-NEXT:    movq -96(%rsp,%rax), %r10
+; X64-SSE2-NEXT:    movq -72(%rsp,%rax), %r11
+; X64-SSE2-NEXT:    movq -80(%rsp,%rax), %rax
+; X64-SSE2-NEXT:    movq %rax, 48(%rdx)
+; X64-SSE2-NEXT:    movq %r11, 56(%rdx)
+; X64-SSE2-NEXT:    movq %r10, 32(%rdx)
+; X64-SSE2-NEXT:    movq %r9, 40(%rdx)
+; X64-SSE2-NEXT:    movq %r8, 16(%rdx)
+; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
+; X64-SSE2-NEXT:    movq %rcx, (%rdx)
+; X64-SSE2-NEXT:    movq %rsi, 8(%rdx)
+; X64-SSE2-NEXT:    popq %rbx
+; X64-SSE2-NEXT:    popq %r14
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: ashr_64bytes:
+; X64-SSE42:       # %bb.0:
+; X64-SSE42-NEXT:    movups (%rdi), %xmm0
+; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
+; X64-SSE42-NEXT:    movups 32(%rdi), %xmm2
+; X64-SSE42-NEXT:    movq 48(%rdi), %rax
+; X64-SSE42-NEXT:    movq 56(%rdi), %rcx
+; X64-SSE42-NEXT:    movl (%rsi), %esi
+; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    sarq $63, %rcx
+; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT:    andl $63, %esi
+; X64-SSE42-NEXT:    movups -128(%rsp,%rsi), %xmm0
+; X64-SSE42-NEXT:    movups -112(%rsp,%rsi), %xmm1
+; X64-SSE42-NEXT:    movups -96(%rsp,%rsi), %xmm2
+; X64-SSE42-NEXT:    movups -80(%rsp,%rsi), %xmm3
+; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
+; X64-SSE42-NEXT:    movups %xmm2, 32(%rdx)
+; X64-SSE42-NEXT:    movups %xmm3, 48(%rdx)
+; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX-LABEL: ashr_64bytes:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
+; X64-AVX-NEXT:    vmovups 32(%rdi), %xmm1
+; X64-AVX-NEXT:    movq 48(%rdi), %rax
+; X64-AVX-NEXT:    movq 56(%rdi), %rcx
+; X64-AVX-NEXT:    movl (%rsi), %esi
+; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovups %xmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    sarq $63, %rcx
+; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    andl $63, %esi
+; X64-AVX-NEXT:    vmovups -128(%rsp,%rsi), %xmm0
+; X64-AVX-NEXT:    vmovups -112(%rsp,%rsi), %xmm1
+; X64-AVX-NEXT:    vmovups -96(%rsp,%rsi), %xmm2
+; X64-AVX-NEXT:    vmovups -80(%rsp,%rsi), %xmm3
+; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
+; X64-AVX-NEXT:    vmovups %xmm2, 32(%rdx)
+; X64-AVX-NEXT:    vmovups %xmm3, 48(%rdx)
+; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX-NEXT:    vzeroupper
+; X64-AVX-NEXT:    retq
+;
+; X32-SSE2-LABEL: ashr_64bytes:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    pushl %ebp
+; X32-SSE2-NEXT:    pushl %ebx
+; X32-SSE2-NEXT:    pushl %edi
+; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    subl $168, %esp
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl (%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 4(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 8(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 12(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 16(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 20(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 24(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 28(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 32(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 36(%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 40(%eax), %ebp
+; X32-SSE2-NEXT:    movl 44(%eax), %ebx
+; X32-SSE2-NEXT:    movl 48(%eax), %edi
+; X32-SSE2-NEXT:    movl 52(%eax), %esi
+; X32-SSE2-NEXT:    movl 56(%eax), %edx
+; X32-SSE2-NEXT:    movl 60(%eax), %ecx
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl (%eax), %eax
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    sarl $31, %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    andl $63, %eax
+; X32-SSE2-NEXT:    movl 40(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 44(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 52(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 48(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 60(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 56(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 68(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 64(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 76(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 72(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-SSE2-NEXT:    movl 84(%esp,%eax), %ebp
+; X32-SSE2-NEXT:    movl 80(%esp,%eax), %ebx
+; X32-SSE2-NEXT:    movl 92(%esp,%eax), %edi
+; X32-SSE2-NEXT:    movl 88(%esp,%eax), %esi
+; X32-SSE2-NEXT:    movl 100(%esp,%eax), %edx
+; X32-SSE2-NEXT:    movl 96(%esp,%eax), %ecx
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl %ecx, 56(%eax)
+; X32-SSE2-NEXT:    movl %edx, 60(%eax)
+; X32-SSE2-NEXT:    movl %esi, 48(%eax)
+; X32-SSE2-NEXT:    movl %edi, 52(%eax)
+; X32-SSE2-NEXT:    movl %ebx, 40(%eax)
+; X32-SSE2-NEXT:    movl %ebp, 44(%eax)
+; X32-SSE2-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 32(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 36(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 24(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 28(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 16(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 20(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 8(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 12(%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, (%eax)
+; X32-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-SSE2-NEXT:    movl %ecx, 4(%eax)
+; X32-SSE2-NEXT:    addl $168, %esp
+; X32-SSE2-NEXT:    popl %esi
+; X32-SSE2-NEXT:    popl %edi
+; X32-SSE2-NEXT:    popl %ebx
+; X32-SSE2-NEXT:    popl %ebp
+; X32-SSE2-NEXT:    retl
+;
+; X32-SSE42-LABEL: ashr_64bytes:
+; X32-SSE42:       # %bb.0:
+; X32-SSE42-NEXT:    pushl %ebx
+; X32-SSE42-NEXT:    pushl %edi
+; X32-SSE42-NEXT:    pushl %esi
+; X32-SSE42-NEXT:    subl $128, %esp
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-SSE42-NEXT:    movups (%edx), %xmm0
+; X32-SSE42-NEXT:    movups 16(%edx), %xmm1
+; X32-SSE42-NEXT:    movups 32(%edx), %xmm2
+; X32-SSE42-NEXT:    movl 48(%edx), %esi
+; X32-SSE42-NEXT:    movl 52(%edx), %edi
+; X32-SSE42-NEXT:    movl 56(%edx), %ebx
+; X32-SSE42-NEXT:    movl 60(%edx), %edx
+; X32-SSE42-NEXT:    movl (%ecx), %ecx
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm2, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movups %xmm0, (%esp)
+; X32-SSE42-NEXT:    sarl $31, %edx
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT:    andl $63, %ecx
+; X32-SSE42-NEXT:    movups (%esp,%ecx), %xmm0
+; X32-SSE42-NEXT:    movups 16(%esp,%ecx), %xmm1
+; X32-SSE42-NEXT:    movups 32(%esp,%ecx), %xmm2
+; X32-SSE42-NEXT:    movups 48(%esp,%ecx), %xmm3
+; X32-SSE42-NEXT:    movups %xmm3, 48(%eax)
+; X32-SSE42-NEXT:    movups %xmm2, 32(%eax)
+; X32-SSE42-NEXT:    movups %xmm1, 16(%eax)
+; X32-SSE42-NEXT:    movups %xmm0, (%eax)
+; X32-SSE42-NEXT:    addl $128, %esp
+; X32-SSE42-NEXT:    popl %esi
+; X32-SSE42-NEXT:    popl %edi
+; X32-SSE42-NEXT:    popl %ebx
+; X32-SSE42-NEXT:    retl
+;
+; X32-AVX-LABEL: ashr_64bytes:
+; X32-AVX:       # %bb.0:
+; X32-AVX-NEXT:    pushl %ebx
+; X32-AVX-NEXT:    pushl %edi
+; X32-AVX-NEXT:    pushl %esi
+; X32-AVX-NEXT:    subl $128, %esp
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-AVX-NEXT:    vmovups (%edx), %ymm0
+; X32-AVX-NEXT:    vmovups 32(%edx), %xmm1
+; X32-AVX-NEXT:    movl 48(%edx), %esi
+; X32-AVX-NEXT:    movl 52(%edx), %edi
+; X32-AVX-NEXT:    movl 56(%edx), %ebx
+; X32-AVX-NEXT:    movl 60(%edx), %edx
+; X32-AVX-NEXT:    movl (%ecx), %ecx
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    vmovups %xmm1, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    vmovups %ymm0, (%esp)
+; X32-AVX-NEXT:    sarl $31, %edx
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    andl $63, %ecx
+; X32-AVX-NEXT:    vmovups (%esp,%ecx), %xmm0
+; X32-AVX-NEXT:    vmovups 16(%esp,%ecx), %xmm1
+; X32-AVX-NEXT:    vmovups 32(%esp,%ecx), %xmm2
+; X32-AVX-NEXT:    vmovups 48(%esp,%ecx), %xmm3
+; X32-AVX-NEXT:    vmovups %xmm3, 48(%eax)
+; X32-AVX-NEXT:    vmovups %xmm2, 32(%eax)
+; X32-AVX-NEXT:    vmovups %xmm1, 16(%eax)
+; X32-AVX-NEXT:    vmovups %xmm0, (%eax)
+; X32-AVX-NEXT:    addl $128, %esp
+; X32-AVX-NEXT:    popl %esi
+; X32-AVX-NEXT:    popl %edi
+; X32-AVX-NEXT:    popl %ebx
+; X32-AVX-NEXT:    vzeroupper
+; X32-AVX-NEXT:    retl
+  %src = load i512, ptr %src.ptr, align 1
+  %byteOff = load i512, ptr %byteOff.ptr, align 1
+  %bitOff = shl i512 %byteOff, 3
+  %res = ashr i512 %src, %bitOff
+  store i512 %res, ptr %dst, align 1
+  ret void
+}
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
-; X32-NO-SHLD: {{.*}}
-; X32-SHLD: {{.*}}
-; X64-NO-SHLD: {{.*}}
-; X64-SHLD: {{.*}}
+; FALLBACK0: {{.*}}
+; FALLBACK1: {{.*}}
+; FALLBACK10: {{.*}}
+; FALLBACK11: {{.*}}
+; FALLBACK12: {{.*}}
+; FALLBACK13: {{.*}}
+; FALLBACK14: {{.*}}
+; FALLBACK15: {{.*}}
+; FALLBACK16: {{.*}}
+; FALLBACK17: {{.*}}
+; FALLBACK18: {{.*}}
+; FALLBACK19: {{.*}}
+; FALLBACK2: {{.*}}
+; FALLBACK20: {{.*}}
+; FALLBACK21: {{.*}}
+; FALLBACK22: {{.*}}
+; FALLBACK23: {{.*}}
+; FALLBACK24: {{.*}}
+; FALLBACK25: {{.*}}
+; FALLBACK26: {{.*}}
+; FALLBACK27: {{.*}}
+; FALLBACK28: {{.*}}
+; FALLBACK29: {{.*}}
+; FALLBACK3: {{.*}}
+; FALLBACK30: {{.*}}
+; FALLBACK31: {{.*}}
+; FALLBACK4: {{.*}}
+; FALLBACK5: {{.*}}
+; FALLBACK6: {{.*}}
+; FALLBACK7: {{.*}}
+; FALLBACK8: {{.*}}
+; FALLBACK9: {{.*}}
+; X32: {{.*}}
+; X64: {{.*}}

diff  --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index 3b37dead8a77..2e940f306dca 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -3373,6 +3373,3998 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
   store i256 %res, ptr %dst, align 1
   ret void
 }
+
+define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%r8), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%r8), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rsi, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%r8), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%r8), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%r8), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%r8), %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbp,%rbp), %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r14, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%r8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 56(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 48(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 32(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r12, 40(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, (%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rdi), %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r14,%r14), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r12, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rdi), %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rdi), %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%rbp,%rbp), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r13, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r15, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r12, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rdi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl (%rsi), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r10, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %r13d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r13d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %r13d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r15, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r15, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 48(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, 32(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r10, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %ebx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ebx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r14,%r14), %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rbx, %r15, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rdi, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%rdi,%rdi), %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rbx, %r12, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r9, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r9,%r9), %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rbx, %r13, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r13, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r13, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 40(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r12, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $208, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esi), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esi), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esi), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esi), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andl $7, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    andl $63, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%esi), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    notl %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%esi), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%esi), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%esi), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%esi), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%esi), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%esi), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%esi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%esi), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%esi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 128(%esp,%esi), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 132(%esp,%esi), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 136(%esp,%esi), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%esi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 140(%esp,%esi), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 60(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $208, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $204, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esi), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esi), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esi), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esi), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esi), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esi), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esi), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esi), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esi), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esi), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esi), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esi), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esi), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%esi), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    notl %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%esi), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%esi), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%esi), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 112(%esp,%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 116(%esp,%esi), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 120(%esp,%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 124(%esp,%esi), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 128(%esp,%esi), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 132(%esp,%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%esi), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 136(%esp,%esi), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 56(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 60(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 52(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $204, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $204, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edx), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%edx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notl %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 132(%esp,%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 128(%esp,%ebx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 76(%esp,%ebx), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 136(%esp,%ebx), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 56(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 40(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 32(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 52(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 44(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 36(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $204, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $200, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %ecx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 112(%esp,%eax), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 120(%esp,%eax), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 116(%esp,%eax), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %ebx, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 128(%esp,%eax), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebp, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 124(%esp,%eax), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %ebp, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, (%esp) # 4-byte Folded Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 132(%esp,%eax), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 56(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 48(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 40(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %edx, %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 60(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 36(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $200, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %src = load i512, ptr %src.ptr, align 1
+  %bitOff = load i512, ptr %bitOff.ptr, align 1
+  %res = lshr i512 %src, %bitOff
+  store i512 %res, ptr %dst, align 1
+  ret void
+}
+define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: shl_64bytes:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    negl %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movslq %esi, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%r14), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%r14), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%r14), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%r14), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%r14), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%r14), %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -8(%rsp,%r14), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%r14), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 48(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbp, 56(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r12, 40(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    negl %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movslq %esi, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%r10), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%r10), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%r10), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%r10), %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%r10), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r12, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%r10), %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%r10), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r13, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r15, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rsp,%r10), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r12, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    negl %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movslq %esi, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rcx), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rcx), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rcx), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rcx), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rcx), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r15, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r8, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bpl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rbp, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rbp, %rbx, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rcx), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -8(%rsp,%rcx), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rcx), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rbp, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rbp, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r10, %r15, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r13, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 48(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 32(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movslq %esi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rsi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -32(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %rdi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -16(%rsp,%rax), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r10, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %ebx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ebx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax), %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrq %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rbx, %r12, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r8, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%rax), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrq %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rbx, %r13, %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r9, %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -24(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rbx, %rbp, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rcx, %r14, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rsi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %rdi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -8(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r10, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shldq %cl, %r14, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 40(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r13, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r12, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: shl_64bytes:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $192, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ebx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebx), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 48(%ebx), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 52(%ebx), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 56(%ebx), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 60(%ebx), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ebx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    andl $7, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %ch
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, (%esp) # 1-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notl %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 52(%ebp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    negl %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 176(%esp,%ecx), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 60(%ebp), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 56(%ebp), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 56(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 48(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 52(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 40(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 44(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 32(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 36(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $192, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $204, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esi), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    notl %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esi), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esi), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esi), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esi), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esi), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esi), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esi), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esi), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    negl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 188(%esp,%edi), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esi), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, (%esp) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esi), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 60(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 52(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 56(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $204, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $216, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ebx), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl %ebp, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ebx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%ebx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %esi, %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    negl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, 212(%esp,%ebp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ebx), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 52(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 36(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 28(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 56(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 48(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 40(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 32(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $216, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $204, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edi), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edi), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edi), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edi), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edi), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edi), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edi), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%edi), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%edi), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%edi), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%edi), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%edi), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%edi), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%edi), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%edi), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edi), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl %edi, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%edx), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%edx), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 188(%esp,%esi), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %ecx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, (%esp) # 4-byte Folded Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 60(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %ebx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 52(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 44(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 56(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%edx)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $204, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %src = load i512, ptr %src.ptr, align 1
+  %bitOff = load i512, ptr %bitOff.ptr, align 1
+  %res = shl i512 %src, %bitOff
+  store i512 %res, ptr %dst, align 1
+  ret void
+}
+define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
+; X64-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
+; X64-NO-BMI2-NO-SHLD:       # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%r8), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%r8), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rsi, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%r8), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%r8), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%r8), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%r8), %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbp,%rbp), %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r14, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%r8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 56(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 48(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 32(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r12, 40(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
+; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, (%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rdi), %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r14,%r14), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r12, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rdi), %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r12, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rdi), %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%rbp,%rbp), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r13, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r15, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r12, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rdi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbp, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r12
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r13
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes:
+; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl (%rsi), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r10, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %r13d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r13d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %r13d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rcx, %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r13, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r15, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r15, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 48(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, 32(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
+; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r10, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %ebx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ebx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r14,%r14), %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rbx, %r15, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rdi, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%rdi,%rdi), %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rbx, %r12, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r9, %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r9,%r9), %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rbx, %r13, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %r13, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r13, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbp, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 40(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r12, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r12
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r13
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
+;
+; X32-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
+; X32-NO-BMI2-NO-SHLD:       # %bb.0:
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    subl $208, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andl $7, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    andl $63, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%esi), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%esi), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    notl %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%esi), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%esi), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%esi), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%esi), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%esi), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%esi), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%esi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%esi), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%esi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 128(%esp,%esi), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 132(%esp,%esi), %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 136(%esp,%esi), %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%esi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-NO-SHLD-NEXT:    notb %dl
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl 140(%esp,%esi), %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 60(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X32-NO-BMI2-NO-SHLD-NEXT:    addl $208, %esp
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-NO-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
+; X32-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    subl $204, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andl $7, %eax
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%esi), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    notl %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%esi), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%esi), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%esi), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 112(%esp,%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 116(%esp,%esi), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 120(%esp,%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 124(%esp,%esi), %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%ecx,%ecx), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 128(%esp,%esi), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 132(%esp,%esi), %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    orl %ebx, %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%esi), %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl 136(%esp,%esi), %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 56(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 60(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 52(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%edx)
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    addl $204, %esp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-NO-BMI2-HAVE-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes:
+; X32-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    subl $204, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andl $7, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebx), %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 132(%esp,%ebx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 128(%esp,%ebx), %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 76(%esp,%ebx), %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %ebp # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl 136(%esp,%ebx), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ecx, %edx, %ecx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %eax
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 60(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 56(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 40(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 32(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 52(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 44(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 36(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edx)
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    addl $204, %esp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-NO-SHLD-NEXT:    retl
+;
+; X32-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
+; X32-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $204, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $7, %ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%edx), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%edx), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %edi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 116(%esp,%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 112(%esp,%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 124(%esp,%edx), %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%ebp,%ebp), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 120(%esp,%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 132(%esp,%edx), %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (%edi,%edi), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 128(%esp,%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    orl %esi, %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%edx), %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 136(%esp,%edx), %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 56(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 48(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 40(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 32(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 24(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 16(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %edx, %edx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 60(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 36(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $204, %esp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
+; X32-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
+  %src = load i512, ptr %src.ptr, align 1
+  %bitOff = load i512, ptr %bitOff.ptr, align 1
+  %res = ashr i512 %src, %bitOff
+  store i512 %res, ptr %dst, align 1
+  ret void
+}
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
 ; X32: {{.*}}


        


More information about the llvm-commits mailing list