[llvm] [TableGen] Fix regunit superset calculation (PR #81850)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 30 03:49:57 PDT 2024
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/81850
>From ec587dc9e1c205b529711eb39cd054973c585ae4 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 15 Feb 2024 12:14:17 +0000
Subject: [PATCH 1/2] [TableGen] Fix regunit superset calculation
Keep creating new supersets from new supersets until we reach a fixed
point. For most targets this ends up with fewer regunitsets overall
because it makes pruning more effective.
---
.../CodeGen/X86/2011-10-19-LegelizeLoad.ll | 15 +-
llvm/test/CodeGen/X86/abs.ll | 60 +-
llvm/test/CodeGen/X86/apx/mul-i1024.ll | 131 +-
llvm/test/CodeGen/X86/avx512-calling-conv.ll | 54 +-
llvm/test/CodeGen/X86/avx512-regcall-Mask.ll | 4 +-
.../test/CodeGen/X86/avx512-regcall-NoMask.ll | 8 +-
.../X86/div-rem-pair-recomposition-signed.ll | 220 +-
.../div-rem-pair-recomposition-unsigned.ll | 222 +-
llvm/test/CodeGen/X86/divrem-by-select.ll | 30 +-
llvm/test/CodeGen/X86/divrem.ll | 24 +-
llvm/test/CodeGen/X86/fold-tied-op.ll | 100 +-
llvm/test/CodeGen/X86/funnel-shift.ll | 47 +-
llvm/test/CodeGen/X86/h-registers-1.ll | 62 +-
llvm/test/CodeGen/X86/isel-sdiv.ll | 2 +-
llvm/test/CodeGen/X86/isel-srem.ll | 2 +-
llvm/test/CodeGen/X86/isel-udiv.ll | 2 +-
llvm/test/CodeGen/X86/isel-urem.ll | 2 +-
llvm/test/CodeGen/X86/legalize-shift-64.ll | 18 +-
llvm/test/CodeGen/X86/mul-i1024.ll | 1650 +++++++------
llvm/test/CodeGen/X86/mul-i256.ll | 14 +-
llvm/test/CodeGen/X86/mul-i512.ll | 54 +-
llvm/test/CodeGen/X86/musttail-varargs.ll | 8 +-
.../X86/peephole-na-phys-copy-folding.ll | 12 +-
llvm/test/CodeGen/X86/pr38539.ll | 101 +-
llvm/test/CodeGen/X86/sdiv_fix.ll | 130 +-
llvm/test/CodeGen/X86/shift-i128.ll | 470 ++--
llvm/test/CodeGen/X86/shift-i256.ll | 186 +-
llvm/test/CodeGen/X86/shrink_vmul.ll | 196 +-
llvm/test/CodeGen/X86/smax.ll | 27 +-
llvm/test/CodeGen/X86/smin.ll | 27 +-
llvm/test/CodeGen/X86/smul-with-overflow.ll | 54 +-
.../X86/smulo-128-legalisation-lowering.ll | 42 +-
llvm/test/CodeGen/X86/sse-regcall.ll | 8 +-
llvm/test/CodeGen/X86/sse-regcall4.ll | 8 +-
llvm/test/CodeGen/X86/sshl_sat_vec.ll | 57 +-
.../subvectorwise-store-of-vector-splat.ll | 297 +--
llvm/test/CodeGen/X86/umax.ll | 58 +-
llvm/test/CodeGen/X86/umin.ll | 27 +-
llvm/test/CodeGen/X86/umul-with-overflow.ll | 125 +-
...unfold-masked-merge-vector-variablemask.ll | 556 +++--
llvm/test/CodeGen/X86/ushl_sat_vec.ll | 162 +-
.../X86/wide-scalar-shift-legalization.ll | 2199 +++++++++--------
.../CodeGen/X86/x86-64-flags-intrinsics.ll | 32 +-
.../TableGen/Common/CodeGenRegisters.cpp | 11 +-
44 files changed, 3802 insertions(+), 3712 deletions(-)
diff --git a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
index 65ff22f960f233..606253b35326c6 100644
--- a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
+++ b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
@@ -17,21 +17,20 @@ target triple = "x86_64-unknown-linux-gnu"
define dso_local i32 @main() nounwind uwtable {
; CHECK-LABEL: main:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl i(%rip), %esi
+; CHECK-NEXT: movl i(%rip), %edx
; CHECK-NEXT: movl j(%rip), %eax
-; CHECK-NEXT: movl %esi, %edx
-; CHECK-NEXT: shrl $8, %edx
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: shrl $8, %esi
; CHECK-NEXT: movsbl %al, %ecx
; CHECK-NEXT: shrl $8, %eax
; CHECK-NEXT: cbtw
-; CHECK-NEXT: idivb %dl
-; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: idivb %sil
-; CHECK-NEXT: movzbl %dl, %ecx
+; CHECK-NEXT: movzbl %al, %esi
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: idivb %dl
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: pinsrb $1, %ecx, %xmm0
+; CHECK-NEXT: pinsrb $1, %esi, %xmm0
; CHECK-NEXT: pextrw $0, %xmm0, res(%rip)
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll
index 5969aae43f82e8..6a07fe0633d7fd 100644
--- a/llvm/test/CodeGen/X86/abs.ll
+++ b/llvm/test/CodeGen/X86/abs.ll
@@ -487,10 +487,10 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind {
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
; X86-NEXT: subl $12, %esp
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
; X86-NEXT: movb {{[0-9]+}}(%esp), %bh
; X86-NEXT: movb {{[0-9]+}}(%esp), %bl
; X86-NEXT: movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
; X86-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
@@ -509,26 +509,23 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind {
; X86-NEXT: xorb %al, %ah
; X86-NEXT: subb %al, %ah
; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: movb %ch, %al
-; X86-NEXT: sarb $7, %al
-; X86-NEXT: xorb %al, %ch
-; X86-NEXT: subb %al, %ch
-; X86-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: movb %dh, %al
; X86-NEXT: sarb $7, %al
; X86-NEXT: xorb %al, %dh
; X86-NEXT: subb %al, %dh
-; X86-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: sarb $7, %al
; X86-NEXT: xorb %al, %bl
; X86-NEXT: subb %al, %bl
-; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: movb %bh, %al
; X86-NEXT: sarb $7, %al
; X86-NEXT: xorb %al, %bh
; X86-NEXT: subb %al, %bh
-; X86-NEXT: movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: movb %ch, %al
+; X86-NEXT: sarb $7, %al
+; X86-NEXT: xorb %al, %ch
+; X86-NEXT: subb %al, %ch
+; X86-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sarb $7, %al
@@ -547,21 +544,24 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind {
; X86-NEXT: xorb %al, %cl
; X86-NEXT: subb %al, %cl
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: movb {{[0-9]+}}(%esp), %bh
-; X86-NEXT: movb %bh, %al
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sarb $7, %al
-; X86-NEXT: xorb %al, %bh
-; X86-NEXT: subb %al, %bh
-; X86-NEXT: movb {{[0-9]+}}(%esp), %bl
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: xorb %al, %cl
+; X86-NEXT: subb %al, %cl
+; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sarb $7, %al
-; X86-NEXT: xorb %al, %bl
-; X86-NEXT: subb %al, %bl
-; X86-NEXT: movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT: movb %dh, %al
+; X86-NEXT: xorb %al, %cl
+; X86-NEXT: subb %al, %cl
+; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sarb $7, %al
-; X86-NEXT: xorb %al, %dh
-; X86-NEXT: subb %al, %dh
+; X86-NEXT: xorb %al, %cl
+; X86-NEXT: subb %al, %cl
+; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
; X86-NEXT: movb %ch, %al
; X86-NEXT: sarb $7, %al
@@ -581,9 +581,12 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind {
; X86-NEXT: movb %cl, 15(%eax)
; X86-NEXT: movb %dl, 14(%eax)
; X86-NEXT: movb %ch, 13(%eax)
-; X86-NEXT: movb %dh, 12(%eax)
-; X86-NEXT: movb %bl, 11(%eax)
-; X86-NEXT: movb %bh, 10(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 12(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 11(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 10(%eax)
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X86-NEXT: movb %cl, 9(%eax)
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -592,12 +595,9 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind {
; X86-NEXT: movb %cl, 7(%eax)
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X86-NEXT: movb %cl, 6(%eax)
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT: movb %cl, 5(%eax)
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT: movb %cl, 4(%eax)
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT: movb %cl, 3(%eax)
+; X86-NEXT: movb %bh, 5(%eax)
+; X86-NEXT: movb %bl, 4(%eax)
+; X86-NEXT: movb %dh, 3(%eax)
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X86-NEXT: movb %cl, 2(%eax)
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/apx/mul-i1024.ll b/llvm/test/CodeGen/X86/apx/mul-i1024.ll
index 2b99c44fc769a2..e66ca42b45bee9 100644
--- a/llvm/test/CodeGen/X86/apx/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/apx/mul-i1024.ll
@@ -759,20 +759,20 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NEXT: adcq %r15, %r27
; EGPR-NEXT: adcq %r14, %r24
; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; EGPR-NEXT: movq 80(%r11), %rbx
-; EGPR-NEXT: movq %rbx, %rax
+; EGPR-NEXT: movq 88(%r11), %r28
+; EGPR-NEXT: movq 80(%r11), %r14
+; EGPR-NEXT: movq %r14, %rax
; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Reload
; EGPR-NEXT: mulq %r19
-; EGPR-NEXT: movq %rax, %r21
; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq 88(%r11), %r28
+; EGPR-NEXT: movq %rax, %r21
; EGPR-NEXT: movq %r28, %rax
; EGPR-NEXT: mulq %r19
; EGPR-NEXT: movq %rdx, %r9
; EGPR-NEXT: movq %rax, %r16
; EGPR-NEXT: addq %r8, %r16
; EGPR-NEXT: adcq $0, %r9
-; EGPR-NEXT: movq %rbx, %rax
+; EGPR-NEXT: movq %r14, %rax
; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r17 # 8-byte Reload
; EGPR-NEXT: mulq %r17
; EGPR-NEXT: movq %rdx, %r8
@@ -787,13 +787,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NEXT: addq %r8, %r16
; EGPR-NEXT: movzbl %cl, %eax
; EGPR-NEXT: adcq %rax, %r9
+; EGPR-NEXT: movq 72(%r11), %rbx
; EGPR-NEXT: movq 64(%r11), %r15
; EGPR-NEXT: movq %r15, %rax
; EGPR-NEXT: mulq %r19
-; EGPR-NEXT: movq %rax, %r23
; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq 72(%r11), %r14
-; EGPR-NEXT: movq %r14, %rax
+; EGPR-NEXT: movq %rax, %r23
+; EGPR-NEXT: movq %rbx, %rax
; EGPR-NEXT: mulq %r19
; EGPR-NEXT: movq %rdx, %r30
; EGPR-NEXT: movq %rax, %r31
@@ -806,7 +806,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NEXT: addq %r31, %r29
; EGPR-NEXT: adcq %r30, %r8
; EGPR-NEXT: setb %cl
-; EGPR-NEXT: movq %r14, %rax
+; EGPR-NEXT: movq %rbx, %rax
; EGPR-NEXT: mulq %r17
; EGPR-NEXT: movq %rdx, %r31
; EGPR-NEXT: movq %rax, %r13
@@ -822,7 +822,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NEXT: mulq %rdi
; EGPR-NEXT: movq %rdx, %r8
; EGPR-NEXT: movq %rax, %r30
-; EGPR-NEXT: movq %r14, %rax
+; EGPR-NEXT: movq %rbx, %rax
; EGPR-NEXT: mulq %rdi
; EGPR-NEXT: movq %rdx, %r26
; EGPR-NEXT: movq %rax, %rcx
@@ -836,7 +836,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NEXT: addq %rcx, %r21
; EGPR-NEXT: adcq %r26, %r10
; EGPR-NEXT: setb %cl
-; EGPR-NEXT: movq %r14, %rax
+; EGPR-NEXT: movq %rbx, %rax
; EGPR-NEXT: mulq %r18
; EGPR-NEXT: movq %rdx, %r26
; EGPR-NEXT: movq %rax, %r8
@@ -850,7 +850,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NEXT: addq %r16, %r8
; EGPR-NEXT: adcq %r9, %r26
; EGPR-NEXT: setb %sil
-; EGPR-NEXT: movq %rbx, %rax
+; EGPR-NEXT: movq %r14, %rax
; EGPR-NEXT: mulq %rdi
; EGPR-NEXT: movq %rdx, %rcx
; EGPR-NEXT: movq %rax, %r31
@@ -860,7 +860,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NEXT: movq %rax, %r10
; EGPR-NEXT: addq %rcx, %r10
; EGPR-NEXT: adcq $0, %r9
-; EGPR-NEXT: movq %rbx, %rax
+; EGPR-NEXT: movq %r14, %rax
; EGPR-NEXT: mulq %r18
; EGPR-NEXT: movq %rdx, %rcx
; EGPR-NEXT: movq %rax, %r13
@@ -935,13 +935,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NEXT: movq %rax, %rcx
; EGPR-NEXT: addq %rdi, %rdx
; EGPR-NEXT: movq %r8, %rax
-; EGPR-NEXT: imulq %r14, %rax
+; EGPR-NEXT: imulq %rbx, %rax
; EGPR-NEXT: addq %rdx, %rax
; EGPR-NEXT: movq %rax, %r18
-; EGPR-NEXT: movq %rbx, %rdi
+; EGPR-NEXT: movq %r14, %rdi
; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Reload
; EGPR-NEXT: imulq %r19, %rdi
-; EGPR-NEXT: movq %rbx, %rax
+; EGPR-NEXT: movq %r14, %rax
; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
; EGPR-NEXT: mulq %r8
; EGPR-NEXT: movq %rax, %r26
@@ -957,19 +957,19 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NEXT: movq %rax, %r8
; EGPR-NEXT: movq %r19, %rax
; EGPR-NEXT: mulq %r15
-; EGPR-NEXT: movq %rdx, %rbx
+; EGPR-NEXT: movq %rdx, %r14
; EGPR-NEXT: movq %rax, %r15
; EGPR-NEXT: addq %rcx, %r15
-; EGPR-NEXT: adcq $0, %rbx
+; EGPR-NEXT: adcq $0, %r14
; EGPR-NEXT: movq %rdi, %rax
-; EGPR-NEXT: mulq %r14
+; EGPR-NEXT: mulq %rbx
; EGPR-NEXT: movq %rdx, %rcx
; EGPR-NEXT: movq %rax, %r18
; EGPR-NEXT: addq %r15, %r18
-; EGPR-NEXT: adcq %rbx, %rcx
+; EGPR-NEXT: adcq %r14, %rcx
; EGPR-NEXT: setb %dil
; EGPR-NEXT: movq %r19, %rax
-; EGPR-NEXT: mulq %r14
+; EGPR-NEXT: mulq %rbx
; EGPR-NEXT: addq %rcx, %rax
; EGPR-NEXT: movzbl %dil, %ecx
; EGPR-NEXT: adcq %rcx, %rdx
@@ -1041,7 +1041,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: pushq %r13
; EGPR-NDD-NEXT: pushq %r12
; EGPR-NDD-NEXT: pushq %rbx
-; EGPR-NDD-NEXT: subq $104, %rsp
+; EGPR-NDD-NEXT: subq $96, %rsp
; EGPR-NDD-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; EGPR-NDD-NEXT: movq %rdi, %r20
; EGPR-NDD-NEXT: movq (%rdi), %r16
@@ -1298,7 +1298,6 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: adcq $0, %r29, %r8
; EGPR-NDD-NEXT: adcq $0, %rsi, %r9
; EGPR-NDD-NEXT: movq %r11, %r14
-; EGPR-NDD-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; EGPR-NDD-NEXT: movq 48(%r11), %r11
; EGPR-NDD-NEXT: movq %r10, %rsi
; EGPR-NDD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1393,22 +1392,22 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: addq %rax, %rdi
; EGPR-NDD-NEXT: movzbl %r8b, %eax
; EGPR-NDD-NEXT: adcq %rax, %rdx, %r8
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; EGPR-NDD-NEXT: movq %r14, %rax
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NDD-NEXT: movq %r15, %rax
; EGPR-NDD-NEXT: mulq %r26
; EGPR-NDD-NEXT: movq %rdx, %r29
; EGPR-NDD-NEXT: movq %rax, %r25
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; EGPR-NDD-NEXT: movq %r15, %rax
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; EGPR-NDD-NEXT: movq %r12, %rax
; EGPR-NDD-NEXT: mulq %r26
; EGPR-NDD-NEXT: addq %r29, %rax, %r9
; EGPR-NDD-NEXT: adcq $0, %rdx, %r10
-; EGPR-NDD-NEXT: movq %r14, %rax
+; EGPR-NDD-NEXT: movq %r15, %rax
; EGPR-NDD-NEXT: mulq %r18
; EGPR-NDD-NEXT: addq %r9, %rax, %rbx
; EGPR-NDD-NEXT: adcq %rdx, %r10, %r9
; EGPR-NDD-NEXT: setb %r10b
-; EGPR-NDD-NEXT: movq %r15, %rax
+; EGPR-NDD-NEXT: movq %r12, %rax
; EGPR-NDD-NEXT: mulq %r18
; EGPR-NDD-NEXT: addq %r9, %rax
; EGPR-NDD-NEXT: movzbl %r10b, %r9d
@@ -1417,20 +1416,20 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: adcq %rdx, %rsi
; EGPR-NDD-NEXT: adcq $0, %rdi
; EGPR-NDD-NEXT: adcq $0, %r8
-; EGPR-NDD-NEXT: movq %r14, %rax
+; EGPR-NDD-NEXT: movq %r15, %rax
; EGPR-NDD-NEXT: mulq %r11
; EGPR-NDD-NEXT: movq %rdx, %r28
; EGPR-NDD-NEXT: movq %rax, %r29
-; EGPR-NDD-NEXT: movq %r15, %rax
+; EGPR-NDD-NEXT: movq %r12, %rax
; EGPR-NDD-NEXT: mulq %r11
; EGPR-NDD-NEXT: addq %r28, %rax, %r10
; EGPR-NDD-NEXT: adcq $0, %rdx, %r27
-; EGPR-NDD-NEXT: movq %r14, %rax
+; EGPR-NDD-NEXT: movq %r15, %rax
; EGPR-NDD-NEXT: mulq %r16
; EGPR-NDD-NEXT: addq %rax, %r10
; EGPR-NDD-NEXT: adcq %rdx, %r27
; EGPR-NDD-NEXT: setb %r28b
-; EGPR-NDD-NEXT: movq %r15, %rax
+; EGPR-NDD-NEXT: movq %r12, %rax
; EGPR-NDD-NEXT: mulq %r16
; EGPR-NDD-NEXT: addq %r27, %rax
; EGPR-NDD-NEXT: movzbl %r28b, %r27d
@@ -1446,7 +1445,6 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: mulq %r11
; EGPR-NDD-NEXT: movq %rdx, %r28
; EGPR-NDD-NEXT: movq %rax, %r29
-; EGPR-NDD-NEXT: movq %r23, %r14
; EGPR-NDD-NEXT: movq %r23, %rax
; EGPR-NDD-NEXT: mulq %r11
; EGPR-NDD-NEXT: addq %r28, %rax, %r27
@@ -1476,9 +1474,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; EGPR-NDD-NEXT: movzbl %bpl, %ecx
; EGPR-NDD-NEXT: adcq %rdi, %rcx
-; EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: adcq $0, %r8, %rcx
; EGPR-NDD-NEXT: movq %rcx, (%rsp) # 8-byte Spill
+; EGPR-NDD-NEXT: adcq $0, %r8, %rcx
+; EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; EGPR-NDD-NEXT: adcq $0, %rax
; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; EGPR-NDD-NEXT: adcq $0, %rdx, %rax
@@ -1660,19 +1658,18 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: adcq %rcx, %r19, %rbx
; EGPR-NDD-NEXT: adcq %rax, %r31, %rbp
; EGPR-NDD-NEXT: adcq %rdx, %r12, %r30
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r18 # 8-byte Reload
-; EGPR-NDD-NEXT: movq 80(%r18), %r22
-; EGPR-NDD-NEXT: movq %r22, %rax
+; EGPR-NDD-NEXT: movq 88(%r14), %r20
+; EGPR-NDD-NEXT: movq 80(%r14), %r23
+; EGPR-NDD-NEXT: movq %r23, %rax
; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Reload
; EGPR-NDD-NEXT: mulq %r21
-; EGPR-NDD-NEXT: movq %rax, %r26
; EGPR-NDD-NEXT: movq %rdx, %rdi
-; EGPR-NDD-NEXT: movq 88(%r18), %r20
+; EGPR-NDD-NEXT: movq %rax, %r26
; EGPR-NDD-NEXT: movq %r20, %rax
; EGPR-NDD-NEXT: mulq %r21
; EGPR-NDD-NEXT: addq %rdi, %rax, %rcx
; EGPR-NDD-NEXT: adcq $0, %rdx, %rsi
-; EGPR-NDD-NEXT: movq %r22, %rax
+; EGPR-NDD-NEXT: movq %r23, %rax
; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
; EGPR-NDD-NEXT: mulq %r12
; EGPR-NDD-NEXT: addq %rax, %rcx
@@ -1683,13 +1680,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: addq %rax, %rsi
; EGPR-NDD-NEXT: movzbl %dil, %eax
; EGPR-NDD-NEXT: adcq %rax, %rdx, %rdi
-; EGPR-NDD-NEXT: movq 64(%r18), %r24
+; EGPR-NDD-NEXT: movq 72(%r14), %r22
+; EGPR-NDD-NEXT: movq 64(%r14), %r24
; EGPR-NDD-NEXT: movq %r24, %rax
; EGPR-NDD-NEXT: mulq %r21
-; EGPR-NDD-NEXT: movq %rax, %r29
; EGPR-NDD-NEXT: movq %rdx, %r27
-; EGPR-NDD-NEXT: movq 72(%r18), %r23
-; EGPR-NDD-NEXT: movq %r23, %rax
+; EGPR-NDD-NEXT: movq %rax, %r29
+; EGPR-NDD-NEXT: movq %r22, %rax
; EGPR-NDD-NEXT: mulq %r21
; EGPR-NDD-NEXT: addq %r27, %rax, %r8
; EGPR-NDD-NEXT: adcq $0, %rdx, %r9
@@ -1698,7 +1695,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: addq %r8, %rax, %r31
; EGPR-NDD-NEXT: adcq %rdx, %r9, %r8
; EGPR-NDD-NEXT: setb %r9b
-; EGPR-NDD-NEXT: movq %r23, %rax
+; EGPR-NDD-NEXT: movq %r22, %rax
; EGPR-NDD-NEXT: mulq %r12
; EGPR-NDD-NEXT: addq %r8, %rax
; EGPR-NDD-NEXT: movzbl %r9b, %r8d
@@ -1712,7 +1709,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: mulq %r16
; EGPR-NDD-NEXT: movq %rdx, %r26
; EGPR-NDD-NEXT: movq %rax, %r27
-; EGPR-NDD-NEXT: movq %r23, %rax
+; EGPR-NDD-NEXT: movq %r22, %rax
; EGPR-NDD-NEXT: mulq %r16
; EGPR-NDD-NEXT: addq %r26, %rax, %r9
; EGPR-NDD-NEXT: adcq $0, %rdx, %r10
@@ -1721,7 +1718,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: addq %rax, %r9
; EGPR-NDD-NEXT: adcq %rdx, %r10
; EGPR-NDD-NEXT: setb %r11b
-; EGPR-NDD-NEXT: movq %r23, %rax
+; EGPR-NDD-NEXT: movq %r22, %rax
; EGPR-NDD-NEXT: mulq %r17
; EGPR-NDD-NEXT: addq %r10, %rax
; EGPR-NDD-NEXT: movzbl %r11b, %r10d
@@ -1733,7 +1730,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: addq %rax, %rsi
; EGPR-NDD-NEXT: adcq %rdi, %rcx
; EGPR-NDD-NEXT: setb %dil
-; EGPR-NDD-NEXT: movq %r22, %rax
+; EGPR-NDD-NEXT: movq %r23, %rax
; EGPR-NDD-NEXT: mulq %r16
; EGPR-NDD-NEXT: movq %rdx, %r26
; EGPR-NDD-NEXT: movq %rax, %r27
@@ -1741,7 +1738,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: mulq %r16
; EGPR-NDD-NEXT: addq %r26, %rax, %r8
; EGPR-NDD-NEXT: adcq $0, %rdx, %r9
-; EGPR-NDD-NEXT: movq %r22, %rax
+; EGPR-NDD-NEXT: movq %r23, %rax
; EGPR-NDD-NEXT: mulq %r17
; EGPR-NDD-NEXT: addq %rax, %r8
; EGPR-NDD-NEXT: adcq %rdx, %r9
@@ -1756,22 +1753,20 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: movzbl %dil, %ecx
; EGPR-NDD-NEXT: adcq %rax, %rcx
; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi
-; EGPR-NDD-NEXT: movq %r18, %r9
-; EGPR-NDD-NEXT: movq 96(%r18), %r26
+; EGPR-NDD-NEXT: movq 96(%r14), %r26
; EGPR-NDD-NEXT: imulq %r17, %r26, %rsi
; EGPR-NDD-NEXT: movq %r26, %rax
; EGPR-NDD-NEXT: mulq %r16
; EGPR-NDD-NEXT: movq %rax, %r18
; EGPR-NDD-NEXT: addq %rsi, %rdx, %rax
-; EGPR-NDD-NEXT: movq 104(%r9), %r8
+; EGPR-NDD-NEXT: movq 104(%r14), %r8
; EGPR-NDD-NEXT: imulq %r16, %r8, %rdx
; EGPR-NDD-NEXT: addq %rdx, %rax, %rsi
-; EGPR-NDD-NEXT: movq 112(%r9), %rax
-; EGPR-NDD-NEXT: movq %r9, %r11
+; EGPR-NDD-NEXT: movq 112(%r14), %rax
; EGPR-NDD-NEXT: imulq %r12, %rax, %r9
; EGPR-NDD-NEXT: mulq %r21
; EGPR-NDD-NEXT: addq %r9, %rdx
-; EGPR-NDD-NEXT: imulq 120(%r11), %r21, %r9
+; EGPR-NDD-NEXT: imulq 120(%r14), %r21, %r9
; EGPR-NDD-NEXT: addq %r9, %rdx
; EGPR-NDD-NEXT: addq %r18, %rax, %r9
; EGPR-NDD-NEXT: adcq %rsi, %rdx, %r16
@@ -1795,16 +1790,16 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: adcq %r8, %rdx
; EGPR-NDD-NEXT: addq %r9, %rax, %r10
; EGPR-NDD-NEXT: adcq %r16, %rdx, %r17
-; EGPR-NDD-NEXT: imulq %r14, %r24, %r8
+; EGPR-NDD-NEXT: imulq {{[-0-9]+}}(%r{{[sb]}}p), %r24, %r8 # 8-byte Folded Reload
; EGPR-NDD-NEXT: movq %r24, %rax
; EGPR-NDD-NEXT: mulq %r13
; EGPR-NDD-NEXT: movq %rax, %r9
; EGPR-NDD-NEXT: addq %r8, %rdx, %rax
-; EGPR-NDD-NEXT: imulq %r13, %r23, %rdx
+; EGPR-NDD-NEXT: imulq %r13, %r22, %rdx
; EGPR-NDD-NEXT: addq %rdx, %rax, %r8
; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Reload
-; EGPR-NDD-NEXT: imulq %r21, %r22, %r16
-; EGPR-NDD-NEXT: movq %r22, %rax
+; EGPR-NDD-NEXT: imulq %r21, %r23, %r16
+; EGPR-NDD-NEXT: movq %r23, %rax
; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r26 # 8-byte Reload
; EGPR-NDD-NEXT: mulq %r26
; EGPR-NDD-NEXT: addq %r16, %rdx
@@ -1817,17 +1812,17 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: movq %rdx, %r8
; EGPR-NDD-NEXT: movq %rax, %r9
; EGPR-NDD-NEXT: movq %r21, %rax
-; EGPR-NDD-NEXT: movq %r21, %r22
+; EGPR-NDD-NEXT: movq %r21, %r23
; EGPR-NDD-NEXT: mulq %r24
; EGPR-NDD-NEXT: addq %rax, %r8
; EGPR-NDD-NEXT: adcq $0, %rdx, %r20
; EGPR-NDD-NEXT: movq %r26, %rax
-; EGPR-NDD-NEXT: mulq %r23
+; EGPR-NDD-NEXT: mulq %r22
; EGPR-NDD-NEXT: addq %rax, %r8
; EGPR-NDD-NEXT: adcq %rdx, %r20
; EGPR-NDD-NEXT: setb %r21b
-; EGPR-NDD-NEXT: movq %r22, %rax
-; EGPR-NDD-NEXT: mulq %r23
+; EGPR-NDD-NEXT: movq %r23, %rax
+; EGPR-NDD-NEXT: mulq %r22
; EGPR-NDD-NEXT: addq %r20, %rax
; EGPR-NDD-NEXT: movzbl %r21b, %r20d
; EGPR-NDD-NEXT: adcq %r20, %rdx
@@ -1853,8 +1848,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: adcq %rdi, {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
; EGPR-NDD-NEXT: adcq %r9, {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
; EGPR-NDD-NEXT: adcq %r10, {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; EGPR-NDD-NEXT: adcq %rsi, {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; EGPR-NDD-NEXT: adcq %r8, (%rsp), %r8 # 8-byte Folded Reload
+; EGPR-NDD-NEXT: adcq %rsi, (%rsp), %rsi # 8-byte Folded Reload
+; EGPR-NDD-NEXT: adcq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
; EGPR-NDD-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; EGPR-NDD-NEXT: adcq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
@@ -1882,7 +1877,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-NEXT: movq %r8, 104(%r11)
; EGPR-NDD-NEXT: movq %rax, 112(%r11)
; EGPR-NDD-NEXT: movq %rcx, 120(%r11)
-; EGPR-NDD-NEXT: addq $104, %rsp
+; EGPR-NDD-NEXT: addq $96, %rsp
; EGPR-NDD-NEXT: popq %rbx
; EGPR-NDD-NEXT: popq %r12
; EGPR-NDD-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index b39b089faa2a5e..92a31bb2d345d2 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -2731,7 +2731,10 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
;
; KNL_X32-LABEL: test17:
; KNL_X32: ## %bb.0:
+; KNL_X32-NEXT: pushl %ebp
; KNL_X32-NEXT: pushl %ebx
+; KNL_X32-NEXT: pushl %edi
+; KNL_X32-NEXT: pushl %esi
; KNL_X32-NEXT: subl $16, %esp
; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; KNL_X32-NEXT: andl $1, %eax
@@ -3108,41 +3111,48 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
; KNL_X32-NEXT: kandw %k1, %k0, %k0
; KNL_X32-NEXT: kshiftrw $6, %k0, %k1
-; KNL_X32-NEXT: kmovw %k1, %ecx
+; KNL_X32-NEXT: kmovw %k1, %esi
; KNL_X32-NEXT: kshiftrw $5, %k0, %k1
-; KNL_X32-NEXT: kmovw %k1, %eax
-; KNL_X32-NEXT: kshiftrw $1, %k0, %k1
-; KNL_X32-NEXT: kmovw %k1, %edx
-; KNL_X32-NEXT: kshiftrw $2, %k0, %k1
-; KNL_X32-NEXT: kmovw %k0, %ebx
-; KNL_X32-NEXT: andb $1, %bl
-; KNL_X32-NEXT: andb $1, %dl
-; KNL_X32-NEXT: addb %dl, %dl
-; KNL_X32-NEXT: orb %bl, %dl
-; KNL_X32-NEXT: kmovw %k1, %ebx
+; KNL_X32-NEXT: kmovw %k1, %edi
+; KNL_X32-NEXT: kshiftrw $4, %k0, %k1
+; KNL_X32-NEXT: kmovw %k1, %ebp
; KNL_X32-NEXT: kshiftrw $3, %k0, %k1
-; KNL_X32-NEXT: andb $1, %bl
-; KNL_X32-NEXT: shlb $2, %bl
-; KNL_X32-NEXT: orb %dl, %bl
-; KNL_X32-NEXT: kmovw %k1, %edx
-; KNL_X32-NEXT: kshiftrw $4, %k0, %k0
+; KNL_X32-NEXT: kmovw %k1, %ebx
+; KNL_X32-NEXT: kshiftrw $2, %k0, %k1
+; KNL_X32-NEXT: kmovw %k1, %ecx
+; KNL_X32-NEXT: kshiftrw $1, %k0, %k1
+; KNL_X32-NEXT: kmovw %k1, %eax
+; KNL_X32-NEXT: kmovw %k0, %edx
; KNL_X32-NEXT: andb $1, %dl
-; KNL_X32-NEXT: shlb $3, %dl
-; KNL_X32-NEXT: orb %bl, %dl
-; KNL_X32-NEXT: kmovw %k0, %ebx
+; KNL_X32-NEXT: andb $1, %al
+; KNL_X32-NEXT: addb %al, %al
+; KNL_X32-NEXT: orb %dl, %al
+; KNL_X32-NEXT: andb $1, %cl
+; KNL_X32-NEXT: shlb $2, %cl
+; KNL_X32-NEXT: orb %al, %cl
; KNL_X32-NEXT: andb $1, %bl
-; KNL_X32-NEXT: shlb $4, %bl
-; KNL_X32-NEXT: orb %dl, %bl
+; KNL_X32-NEXT: shlb $3, %bl
+; KNL_X32-NEXT: orb %cl, %bl
+; KNL_X32-NEXT: movl %ebp, %eax
; KNL_X32-NEXT: andb $1, %al
-; KNL_X32-NEXT: shlb $5, %al
+; KNL_X32-NEXT: shlb $4, %al
; KNL_X32-NEXT: orb %bl, %al
+; KNL_X32-NEXT: movl %eax, %ecx
+; KNL_X32-NEXT: movl %edi, %eax
+; KNL_X32-NEXT: andb $1, %al
+; KNL_X32-NEXT: shlb $5, %al
+; KNL_X32-NEXT: orb %cl, %al
+; KNL_X32-NEXT: movl %esi, %ecx
; KNL_X32-NEXT: shlb $6, %cl
; KNL_X32-NEXT: orb %al, %cl
; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_X32-NEXT: andb $127, %cl
; KNL_X32-NEXT: movb %cl, (%eax)
; KNL_X32-NEXT: addl $16, %esp
+; KNL_X32-NEXT: popl %esi
+; KNL_X32-NEXT: popl %edi
; KNL_X32-NEXT: popl %ebx
+; KNL_X32-NEXT: popl %ebp
; KNL_X32-NEXT: retl $4
;
; FASTISEL-LABEL: test17:
diff --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
index b3a0c7dffae117..85d302abfd1aef 100644
--- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
+++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
@@ -138,6 +138,7 @@ define dso_local i64 @caller_argv64i1() #0 {
; WIN64-NEXT: movq %rax, %rcx
; WIN64-NEXT: movq %rax, %rdx
; WIN64-NEXT: movq %rax, %rdi
+; WIN64-NEXT: movq %rax, %rsi
; WIN64-NEXT: movq %rax, %r8
; WIN64-NEXT: movq %rax, %r9
; WIN64-NEXT: movq %rax, %r10
@@ -145,7 +146,6 @@ define dso_local i64 @caller_argv64i1() #0 {
; WIN64-NEXT: movq %rax, %r12
; WIN64-NEXT: movq %rax, %r14
; WIN64-NEXT: movq %rax, %r15
-; WIN64-NEXT: movq %rax, %rsi
; WIN64-NEXT: callq test_argv64i1
; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -178,13 +178,13 @@ define dso_local i64 @caller_argv64i1() #0 {
; LINUXOSX64-NEXT: movq %rax, %rcx
; LINUXOSX64-NEXT: movq %rax, %rdx
; LINUXOSX64-NEXT: movq %rax, %rdi
+; LINUXOSX64-NEXT: movq %rax, %rsi
; LINUXOSX64-NEXT: movq %rax, %r8
; LINUXOSX64-NEXT: movq %rax, %r9
; LINUXOSX64-NEXT: movq %rax, %r12
; LINUXOSX64-NEXT: movq %rax, %r13
; LINUXOSX64-NEXT: movq %rax, %r14
; LINUXOSX64-NEXT: movq %rax, %r15
-; LINUXOSX64-NEXT: movq %rax, %rsi
; LINUXOSX64-NEXT: pushq %rax
; LINUXOSX64-NEXT: .cfi_adjust_cfa_offset 8
; LINUXOSX64-NEXT: pushq %rax
diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
index 2081d201704f3a..3aed15caa4ea70 100644
--- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -987,8 +987,6 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
; WIN64: # %bb.0:
; WIN64-NEXT: pushq %rbp
; WIN64-NEXT: pushq %rbx
-; WIN64-NEXT: # kill: def $edx killed $edx def $rdx
-; WIN64-NEXT: # kill: def $esi killed $esi def $rsi
; WIN64-NEXT: # kill: def $r15d killed $r15d def $r15
; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14
; WIN64-NEXT: # kill: def $r12d killed $r12d def $r12
@@ -996,7 +994,9 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
; WIN64-NEXT: # kill: def $r10d killed $r10d def $r10
; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9
; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8
+; WIN64-NEXT: # kill: def $esi killed $esi def $rsi
; WIN64-NEXT: # kill: def $edi killed $edi def $rdi
+; WIN64-NEXT: # kill: def $edx killed $edx def $rdx
; WIN64-NEXT: leal (%rdx,%rdi), %ebx
; WIN64-NEXT: movl %edx, %ebp
; WIN64-NEXT: subl %edi, %ebp
@@ -1032,14 +1032,14 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
;
; LINUXOSX64-LABEL: testi32_inp:
; LINUXOSX64: # %bb.0:
-; LINUXOSX64-NEXT: # kill: def $edx killed $edx def $rdx
-; LINUXOSX64-NEXT: # kill: def $esi killed $esi def $rsi
; LINUXOSX64-NEXT: # kill: def $r14d killed $r14d def $r14
; LINUXOSX64-NEXT: # kill: def $r13d killed $r13d def $r13
; LINUXOSX64-NEXT: # kill: def $r12d killed $r12d def $r12
; LINUXOSX64-NEXT: # kill: def $r9d killed $r9d def $r9
; LINUXOSX64-NEXT: # kill: def $r8d killed $r8d def $r8
+; LINUXOSX64-NEXT: # kill: def $esi killed $esi def $rsi
; LINUXOSX64-NEXT: # kill: def $edi killed $edi def $rdi
+; LINUXOSX64-NEXT: # kill: def $edx killed $edx def $rdx
; LINUXOSX64-NEXT: leal (%rdx,%rdi), %r10d
; LINUXOSX64-NEXT: movl %edx, %r11d
; LINUXOSX64-NEXT: subl %edi, %r11d
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 33cc8e96f663f5..c69c6979d3df45 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -634,22 +634,20 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $48, %esp
+; X86-NEXT: movl 8(%ebp), %ecx
; X86-NEXT: movdqa %xmm0, (%esp)
; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm2
+; X86-NEXT: movd %eax, %xmm3
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm3
-; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X86-NEXT: movd %eax, %xmm2
+; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
@@ -657,18 +655,18 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm2
-; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X86-NEXT: movd %eax, %xmm3
+; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm3
+; X86-NEXT: movd %eax, %xmm2
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movd %eax, %xmm4
-; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
@@ -676,82 +674,70 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm3
-; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; X86-NEXT: movd %eax, %xmm2
+; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm5
+; X86-NEXT: movd %eax, %xmm3
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm6
+; X86-NEXT: movd %eax, %xmm4
+; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
-; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movd %eax, %xmm5
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
-; X86-NEXT: movzbl %al, %esi
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movd %eax, %xmm3
+; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
-; X86-NEXT: movzbl %al, %edi
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movd %eax, %xmm4
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
-; X86-NEXT: movzbl %al, %ebx
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movd %eax, %xmm5
+; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movd %eax, %xmm6
; X86-NEXT: movsbl (%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
-; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X86-NEXT: movd %edx, %xmm7
-; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X86-NEXT: movd %esi, %xmm4
-; X86-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; X86-NEXT: movd %edi, %xmm2
-; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
-; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; X86-NEXT: movd %ebx, %xmm5
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movd %ecx, %xmm6
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm2
-; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; X86-NEXT: movdqa %xmm2, %xmm4
-; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; X86-NEXT: movdqa %xmm4, (%ecx)
-; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-NEXT: movdqa %xmm1, %xmm4
-; X86-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X86-NEXT: pmullw %xmm3, %xmm4
-; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; X86-NEXT: pand %xmm3, %xmm4
+; X86-NEXT: movd %eax, %xmm4
+; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; X86-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X86-NEXT: movdqa %xmm4, %xmm3
+; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X86-NEXT: movdqa %xmm3, (%ecx)
; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-NEXT: movdqa %xmm1, %xmm3
+; X86-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X86-NEXT: pmullw %xmm2, %xmm3
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; X86-NEXT: pand %xmm2, %xmm3
+; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-NEXT: pmullw %xmm2, %xmm1
-; X86-NEXT: pand %xmm3, %xmm1
-; X86-NEXT: packuswb %xmm4, %xmm1
+; X86-NEXT: pmullw %xmm4, %xmm1
+; X86-NEXT: pand %xmm2, %xmm1
+; X86-NEXT: packuswb %xmm3, %xmm1
; X86-NEXT: psubb %xmm1, %xmm0
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
+; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: vector_i128_i8:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbp
-; X64-NEXT: pushq %r15
-; X64-NEXT: pushq %r14
-; X64-NEXT: pushq %r13
-; X64-NEXT: pushq %r12
-; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
@@ -760,101 +746,93 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %edi
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm4
+; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %esi
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r8d
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm3
+; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r9d
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r10d
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm4
+; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r11d
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm5
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %ebx
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %ebp
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm3
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r14d
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm4
+; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r15d
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm5
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r12d
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm3
+; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r13d
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm4
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %edx
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm5
+; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm4
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movd %edi, %xmm3
-; X64-NEXT: movd %esi, %xmm4
-; X64-NEXT: movd %r8d, %xmm5
-; X64-NEXT: movd %r9d, %xmm6
-; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X64-NEXT: movd %r10d, %xmm7
-; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; X64-NEXT: movd %r11d, %xmm4
-; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; X64-NEXT: movd %ebx, %xmm2
-; X64-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; X64-NEXT: movd %ebp, %xmm3
-; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; X64-NEXT: movd %r14d, %xmm4
-; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
-; X64-NEXT: movd %r15d, %xmm6
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; X64-NEXT: movd %r12d, %xmm5
-; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; X64-NEXT: movd %r13d, %xmm3
-; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; X64-NEXT: movd %edx, %xmm6
-; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; X64-NEXT: movzbl %cl, %ecx
-; X64-NEXT: movd %ecx, %xmm4
-; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
; X64-NEXT: movzbl %al, %eax
-; X64-NEXT: movd %eax, %xmm3
-; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; X64-NEXT: movdqa %xmm3, %xmm4
-; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: movdqa %xmm4, (%rax)
+; X64-NEXT: movd %eax, %xmm6
+; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; X64-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; X64-NEXT: movdqa %xmm6, %xmm3
+; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X64-NEXT: movdqa %xmm3, (%rdi)
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-NEXT: movdqa %xmm1, %xmm4
-; X64-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X64-NEXT: pmullw %xmm2, %xmm4
+; X64-NEXT: movdqa %xmm1, %xmm3
+; X64-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X64-NEXT: pmullw %xmm2, %xmm3
; X64-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; X64-NEXT: pand %xmm2, %xmm4
-; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT: pand %xmm2, %xmm3
+; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-NEXT: pmullw %xmm3, %xmm1
+; X64-NEXT: pmullw %xmm6, %xmm1
; X64-NEXT: pand %xmm2, %xmm1
-; X64-NEXT: packuswb %xmm4, %xmm1
+; X64-NEXT: packuswb %xmm3, %xmm1
; X64-NEXT: psubb %xmm1, %xmm0
-; X64-NEXT: popq %rbx
-; X64-NEXT: popq %r12
-; X64-NEXT: popq %r13
-; X64-NEXT: popq %r14
-; X64-NEXT: popq %r15
-; X64-NEXT: popq %rbp
; X64-NEXT: retq
%div = sdiv <16 x i8> %x, %y
store <16 x i8> %div, ptr %divdst, align 16
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index fa45afbb634c4d..8b3ae0af2c8850 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -497,10 +497,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: imull %ecx, %ebp
; X86-NEXT: addl %edx, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: imull %esi, %edi
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: imull %esi, %edi
; X86-NEXT: addl %edx, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: imull %eax, %ebx
@@ -596,22 +596,20 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $48, %esp
+; X86-NEXT: movl 8(%ebp), %ecx
; X86-NEXT: movdqa %xmm0, (%esp)
; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm2
+; X86-NEXT: movd %eax, %xmm3
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm3
-; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X86-NEXT: movd %eax, %xmm2
+; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
@@ -619,18 +617,18 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm2
-; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X86-NEXT: movd %eax, %xmm3
+; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm3
+; X86-NEXT: movd %eax, %xmm2
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movd %eax, %xmm4
-; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
@@ -638,82 +636,70 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm3
-; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; X86-NEXT: movd %eax, %xmm2
+; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm5
+; X86-NEXT: movd %eax, %xmm3
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm6
+; X86-NEXT: movd %eax, %xmm4
+; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
-; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movd %eax, %xmm5
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
-; X86-NEXT: movzbl %al, %esi
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movd %eax, %xmm3
+; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
-; X86-NEXT: movzbl %al, %edi
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movd %eax, %xmm4
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
-; X86-NEXT: movzbl %al, %ebx
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movd %eax, %xmm5
+; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movd %eax, %xmm6
; X86-NEXT: movzbl (%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
-; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X86-NEXT: movd %edx, %xmm7
-; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X86-NEXT: movd %esi, %xmm4
-; X86-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; X86-NEXT: movd %edi, %xmm2
-; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
-; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; X86-NEXT: movd %ebx, %xmm5
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movd %ecx, %xmm6
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm2
-; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; X86-NEXT: movdqa %xmm2, %xmm4
-; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; X86-NEXT: movdqa %xmm4, (%ecx)
-; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-NEXT: movdqa %xmm1, %xmm4
-; X86-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X86-NEXT: pmullw %xmm3, %xmm4
-; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; X86-NEXT: pand %xmm3, %xmm4
+; X86-NEXT: movd %eax, %xmm4
+; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; X86-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X86-NEXT: movdqa %xmm4, %xmm3
+; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X86-NEXT: movdqa %xmm3, (%ecx)
; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-NEXT: movdqa %xmm1, %xmm3
+; X86-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X86-NEXT: pmullw %xmm2, %xmm3
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; X86-NEXT: pand %xmm2, %xmm3
+; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-NEXT: pmullw %xmm2, %xmm1
-; X86-NEXT: pand %xmm3, %xmm1
-; X86-NEXT: packuswb %xmm4, %xmm1
+; X86-NEXT: pmullw %xmm4, %xmm1
+; X86-NEXT: pand %xmm2, %xmm1
+; X86-NEXT: packuswb %xmm3, %xmm1
; X86-NEXT: psubb %xmm1, %xmm0
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
+; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: vector_i128_i8:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbp
-; X64-NEXT: pushq %r15
-; X64-NEXT: pushq %r14
-; X64-NEXT: pushq %r13
-; X64-NEXT: pushq %r12
-; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -722,101 +708,93 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %edi
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm4
+; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %esi
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r8d
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm3
+; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r9d
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r10d
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm4
+; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r11d
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm5
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %ebx
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %ebp
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm3
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r14d
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm4
+; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r15d
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm5
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r12d
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm3
+; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r13d
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm4
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %edx
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm5
+; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movd %eax, %xmm4
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movd %edi, %xmm3
-; X64-NEXT: movd %esi, %xmm4
-; X64-NEXT: movd %r8d, %xmm5
-; X64-NEXT: movd %r9d, %xmm6
-; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X64-NEXT: movd %r10d, %xmm7
-; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; X64-NEXT: movd %r11d, %xmm4
-; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; X64-NEXT: movd %ebx, %xmm2
-; X64-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; X64-NEXT: movd %ebp, %xmm3
-; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; X64-NEXT: movd %r14d, %xmm4
-; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
-; X64-NEXT: movd %r15d, %xmm6
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; X64-NEXT: movd %r12d, %xmm5
-; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; X64-NEXT: movd %r13d, %xmm3
-; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; X64-NEXT: movd %edx, %xmm6
-; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; X64-NEXT: movzbl %cl, %ecx
-; X64-NEXT: movd %ecx, %xmm4
-; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
; X64-NEXT: movzbl %al, %eax
-; X64-NEXT: movd %eax, %xmm3
-; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; X64-NEXT: movdqa %xmm3, %xmm4
-; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: movdqa %xmm4, (%rax)
+; X64-NEXT: movd %eax, %xmm6
+; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; X64-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; X64-NEXT: movdqa %xmm6, %xmm3
+; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X64-NEXT: movdqa %xmm3, (%rdi)
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-NEXT: movdqa %xmm1, %xmm4
-; X64-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X64-NEXT: pmullw %xmm2, %xmm4
+; X64-NEXT: movdqa %xmm1, %xmm3
+; X64-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X64-NEXT: pmullw %xmm2, %xmm3
; X64-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; X64-NEXT: pand %xmm2, %xmm4
-; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT: pand %xmm2, %xmm3
+; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-NEXT: pmullw %xmm3, %xmm1
+; X64-NEXT: pmullw %xmm6, %xmm1
; X64-NEXT: pand %xmm2, %xmm1
-; X64-NEXT: packuswb %xmm4, %xmm1
+; X64-NEXT: packuswb %xmm3, %xmm1
; X64-NEXT: psubb %xmm1, %xmm0
-; X64-NEXT: popq %rbx
-; X64-NEXT: popq %r12
-; X64-NEXT: popq %r13
-; X64-NEXT: popq %r14
-; X64-NEXT: popq %r15
-; X64-NEXT: popq %rbp
; X64-NEXT: retq
%div = udiv <16 x i8> %x, %y
store <16 x i8> %div, ptr %divdst, align 16
diff --git a/llvm/test/CodeGen/X86/divrem-by-select.ll b/llvm/test/CodeGen/X86/divrem-by-select.ll
index f9582bb7343ba3..fadcfa1780685c 100644
--- a/llvm/test/CodeGen/X86/divrem-by-select.ll
+++ b/llvm/test/CodeGen/X86/divrem-by-select.ll
@@ -441,16 +441,15 @@ define <2 x i64> @udivrem_identity_const(<2 x i1> %c, <2 x i64> %x) {
; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rax
; CHECK-X64-V4-NEXT: xorl %edx, %edx
; CHECK-X64-V4-NEXT: divq %rcx
-; CHECK-X64-V4-NEXT: movq %rax, %rcx
-; CHECK-X64-V4-NEXT: movq %rdx, %rsi
-; CHECK-X64-V4-NEXT: vmovq %xmm0, %rdi
+; CHECK-X64-V4-NEXT: movq %rdx, %rcx
+; CHECK-X64-V4-NEXT: vmovq %rax, %xmm2
+; CHECK-X64-V4-NEXT: vmovq %xmm0, %rsi
; CHECK-X64-V4-NEXT: vmovq %xmm1, %rax
; CHECK-X64-V4-NEXT: xorl %edx, %edx
-; CHECK-X64-V4-NEXT: divq %rdi
-; CHECK-X64-V4-NEXT: vmovq %rcx, %xmm0
-; CHECK-X64-V4-NEXT: vmovq %rax, %xmm1
-; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; CHECK-X64-V4-NEXT: vmovq %rsi, %xmm1
+; CHECK-X64-V4-NEXT: divq %rsi
+; CHECK-X64-V4-NEXT: vmovq %rax, %xmm0
+; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-X64-V4-NEXT: vmovq %rcx, %xmm1
; CHECK-X64-V4-NEXT: vmovq %rdx, %xmm2
; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; CHECK-X64-V4-NEXT: vpaddq %xmm1, %xmm0, %xmm0
@@ -498,16 +497,15 @@ define <2 x i64> @sdivrem_identity_const(<2 x i1> %c, <2 x i64> %x) {
; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rax
; CHECK-X64-V4-NEXT: cqto
; CHECK-X64-V4-NEXT: idivq %rcx
-; CHECK-X64-V4-NEXT: movq %rax, %rcx
-; CHECK-X64-V4-NEXT: movq %rdx, %rsi
-; CHECK-X64-V4-NEXT: vmovq %xmm0, %rdi
+; CHECK-X64-V4-NEXT: movq %rdx, %rcx
+; CHECK-X64-V4-NEXT: vmovq %rax, %xmm2
+; CHECK-X64-V4-NEXT: vmovq %xmm0, %rsi
; CHECK-X64-V4-NEXT: vmovq %xmm1, %rax
; CHECK-X64-V4-NEXT: cqto
-; CHECK-X64-V4-NEXT: idivq %rdi
-; CHECK-X64-V4-NEXT: vmovq %rcx, %xmm0
-; CHECK-X64-V4-NEXT: vmovq %rax, %xmm1
-; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; CHECK-X64-V4-NEXT: vmovq %rsi, %xmm1
+; CHECK-X64-V4-NEXT: idivq %rsi
+; CHECK-X64-V4-NEXT: vmovq %rax, %xmm0
+; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-X64-V4-NEXT: vmovq %rcx, %xmm1
; CHECK-X64-V4-NEXT: vmovq %rdx, %xmm2
; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; CHECK-X64-V4-NEXT: vpaddq %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/divrem.ll b/llvm/test/CodeGen/X86/divrem.ll
index ba777b49546113..bdf67254dcf023 100644
--- a/llvm/test/CodeGen/X86/divrem.ll
+++ b/llvm/test/CodeGen/X86/divrem.ll
@@ -57,11 +57,11 @@ define void @si32(i32 %x, i32 %y, ptr %p, ptr %q) nounwind {
; X86-LABEL: si32:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cltd
; X86-NEXT: idivl {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %edx, (%ecx)
; X86-NEXT: popl %esi
@@ -87,11 +87,11 @@ define void @si16(i16 %x, i16 %y, ptr %p, ptr %q) nounwind {
; X86-LABEL: si16:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cwtd
; X86-NEXT: idivw {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movw %ax, (%esi)
; X86-NEXT: movw %dx, (%ecx)
; X86-NEXT: popl %esi
@@ -118,10 +118,10 @@ define void @si8(i8 %x, i8 %y, ptr %p, ptr %q) nounwind {
; X86-LABEL: si8:
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
-; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: idivb {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: idivb {{[0-9]+}}(%esp)
; X86-NEXT: movsbl %ah, %ebx
; X86-NEXT: movb %al, (%edx)
; X86-NEXT: movb %bl, (%ecx)
@@ -198,11 +198,11 @@ define void @ui32(i32 %x, i32 %y, ptr %p, ptr %q) nounwind {
; X86-LABEL: ui32:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %eax, (%esi)
; X86-NEXT: movl %edx, (%ecx)
; X86-NEXT: popl %esi
@@ -228,11 +228,11 @@ define void @ui16(i16 %x, i16 %y, ptr %p, ptr %q) nounwind {
; X86-LABEL: ui16:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divw {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movw %ax, (%esi)
; X86-NEXT: movw %dx, (%ecx)
; X86-NEXT: popl %esi
@@ -259,10 +259,10 @@ define void @ui8(i8 %x, i8 %y, ptr %p, ptr %q) nounwind {
; X86-LABEL: ui8:
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %ah, %ebx
; X86-NEXT: movb %al, (%edx)
; X86-NEXT: movb %bl, (%ecx)
diff --git a/llvm/test/CodeGen/X86/fold-tied-op.ll b/llvm/test/CodeGen/X86/fold-tied-op.ll
index 5ea2964057588f..0d85a5c7b6c361 100644
--- a/llvm/test/CodeGen/X86/fold-tied-op.ll
+++ b/llvm/test/CodeGen/X86/fold-tied-op.ll
@@ -24,84 +24,88 @@ define i64 @fn1() #0 {
; CHECK-NEXT: .cfi_offset %esi, -20
; CHECK-NEXT: .cfi_offset %edi, -16
; CHECK-NEXT: .cfi_offset %ebx, -12
-; CHECK-NEXT: movl $-1028477379, %ecx # imm = 0xC2B2AE3D
-; CHECK-NEXT: movl $668265295, %esi # imm = 0x27D4EB4F
-; CHECK-NEXT: movl a, %edi
-; CHECK-NEXT: cmpl $0, (%edi)
+; CHECK-NEXT: movl $-1028477379, %edi # imm = 0xC2B2AE3D
+; CHECK-NEXT: movl $668265295, %ecx # imm = 0x27D4EB4F
+; CHECK-NEXT: movl a, %esi
+; CHECK-NEXT: cmpl $0, (%esi)
; CHECK-NEXT: je .LBB0_2
; CHECK-NEXT: # %bb.1: # %if.then
-; CHECK-NEXT: movl 8(%edi), %ecx
-; CHECK-NEXT: movl 12(%edi), %edx
+; CHECK-NEXT: movl 8(%esi), %ecx
+; CHECK-NEXT: movl 12(%esi), %edx
; CHECK-NEXT: movl %edx, %eax
; CHECK-NEXT: shldl $1, %ecx, %eax
; CHECK-NEXT: orl %edx, %eax
; CHECK-NEXT: leal (%ecx,%ecx), %edx
; CHECK-NEXT: orl %ecx, %edx
; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 16(%edi), %ebx
-; CHECK-NEXT: movl 20(%edi), %edx
+; CHECK-NEXT: movl $-1028477379, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: # imm = 0xC2B2AE3D
+; CHECK-NEXT: movl 16(%esi), %edi
+; CHECK-NEXT: movl 20(%esi), %edx
; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: shldl $2, %ebx, %edx
+; CHECK-NEXT: shldl $2, %edi, %edx
+; CHECK-NEXT: movl $668265295, %ebx # imm = 0x27D4EB4F
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: shldl $31, %ebx, %ecx
-; CHECK-NEXT: shll $2, %ebx
-; CHECK-NEXT: orl %ecx, %ebx
+; CHECK-NEXT: shldl $31, %edi, %ecx
+; CHECK-NEXT: shll $2, %edi
+; CHECK-NEXT: orl %ecx, %edi
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-NEXT: shrl %ecx
; CHECK-NEXT: orl %edx, %ecx
-; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: adcl %eax, %ecx
; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 24(%edi), %eax
+; CHECK-NEXT: movl 24(%esi), %edi
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: imull %edi, %eax
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl $-1028477379, %ebx # imm = 0xC2B2AE3D
-; CHECK-NEXT: imull %eax, %ebx
-; CHECK-NEXT: mull %esi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: mull %ebx
; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: addl %ebx, %edx
-; CHECK-NEXT: movl 28(%edi), %edi
-; CHECK-NEXT: imull %edi, %esi
-; CHECK-NEXT: addl %edx, %esi
+; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-NEXT: movl 28(%esi), %esi
+; CHECK-NEXT: imull %esi, %ebx
+; CHECK-NEXT: addl %edx, %ebx
; CHECK-NEXT: movl $1336530590, %edx # imm = 0x4FA9D69E
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: mull %edx
-; CHECK-NEXT: imull $-2056954758, %ebx, %ebx # imm = 0x85655C7A
-; CHECK-NEXT: addl %edx, %ebx
-; CHECK-NEXT: imull $1336530590, %edi, %edx # imm = 0x4FA9D69E
-; CHECK-NEXT: addl %ebx, %edx
-; CHECK-NEXT: shrdl $3, %esi, %ecx
-; CHECK-NEXT: sarl $3, %esi
-; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: imull $-2056954758, %edi, %edi # imm = 0x85655C7A
+; CHECK-NEXT: addl %edx, %edi
+; CHECK-NEXT: imull $1336530590, %esi, %edx # imm = 0x4FA9D69E
+; CHECK-NEXT: addl %edi, %edx
+; CHECK-NEXT: shrdl $3, %ebx, %ecx
+; CHECK-NEXT: sarl $3, %ebx
+; CHECK-NEXT: orl %edx, %ebx
; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: movl $-66860409, %ebx # imm = 0xFC03CA87
+; CHECK-NEXT: movl $-66860409, %edx # imm = 0xFC03CA87
+; CHECK-NEXT: imull $326129324, %ecx, %edi # imm = 0x137056AC
; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: mull %ebx
-; CHECK-NEXT: movl %eax, %edi
-; CHECK-NEXT: imull $326129324, %ecx, %eax # imm = 0x137056AC
-; CHECK-NEXT: addl %edx, %eax
-; CHECK-NEXT: imull $-66860409, %esi, %ecx # imm = 0xFC03CA87
-; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: mull %edx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: addl %edx, %edi
+; CHECK-NEXT: imull $-66860409, %ebx, %ecx # imm = 0xFC03CA87
+; CHECK-NEXT: addl %edi, %ecx
; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; CHECK-NEXT: movl %edi, b
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: mull %ebx
-; CHECK-NEXT: imull $326129324, %edi, %esi # imm = 0x137056AC
+; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; CHECK-NEXT: movl %esi, b
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: movl $-66860409, %edx # imm = 0xFC03CA87
+; CHECK-NEXT: mull %edx
+; CHECK-NEXT: imull $326129324, %esi, %esi # imm = 0x137056AC
; CHECK-NEXT: addl %edx, %esi
; CHECK-NEXT: movl %ecx, b+4
; CHECK-NEXT: imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87
; CHECK-NEXT: jmp .LBB0_3
; CHECK-NEXT: .LBB0_2: # %if.else
-; CHECK-NEXT: xorl b+4, %ecx
-; CHECK-NEXT: xorl b, %esi
+; CHECK-NEXT: xorl b+4, %edi
+; CHECK-NEXT: xorl b, %ecx
; CHECK-NEXT: movl $1419758215, %edx # imm = 0x549FCA87
-; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: mull %edx
-; CHECK-NEXT: imull $93298681, %esi, %esi # imm = 0x58F9FF9
+; CHECK-NEXT: imull $93298681, %ecx, %esi # imm = 0x58F9FF9
; CHECK-NEXT: addl %edx, %esi
-; CHECK-NEXT: imull $1419758215, %ecx, %ecx # imm = 0x549FCA87
+; CHECK-NEXT: imull $1419758215, %edi, %ecx # imm = 0x549FCA87
; CHECK-NEXT: .LBB0_3: # %if.end
; CHECK-NEXT: addl %esi, %ecx
; CHECK-NEXT: addl $-1028477341, %eax # imm = 0xC2B2AE63
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index c6f0662cadd6bf..a89d3875010c2f 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -76,40 +76,39 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind {
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: testb $64, %cl
-; X86-SSE2-NEXT: movl %esi, %eax
-; X86-SSE2-NEXT: cmovnel %ebx, %eax
-; X86-SSE2-NEXT: movl %edx, %ebp
+; X86-SSE2-NEXT: movl %eax, %ebp
; X86-SSE2-NEXT: cmovnel %edi, %ebp
+; X86-SSE2-NEXT: movl %edx, %ebx
+; X86-SSE2-NEXT: cmovnel %esi, %ebx
+; X86-SSE2-NEXT: cmovnel {{[0-9]+}}(%esp), %esi
; X86-SSE2-NEXT: cmovnel {{[0-9]+}}(%esp), %edi
-; X86-SSE2-NEXT: cmovnel {{[0-9]+}}(%esp), %ebx
; X86-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %esi
+; X86-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: testb $32, %cl
-; X86-SSE2-NEXT: cmovnel %esi, %edx
-; X86-SSE2-NEXT: cmovnel %ebp, %esi
-; X86-SSE2-NEXT: cmovnel %eax, %ebp
-; X86-SSE2-NEXT: cmovel %edi, %ebx
-; X86-SSE2-NEXT: cmovel %eax, %edi
-; X86-SSE2-NEXT: movl %edi, %eax
-; X86-SSE2-NEXT: shldl %cl, %ebx, %eax
-; X86-SSE2-NEXT: movl %ebp, %ebx
-; X86-SSE2-NEXT: shldl %cl, %edi, %ebx
-; X86-SSE2-NEXT: movl %esi, %edi
-; X86-SSE2-NEXT: shldl %cl, %ebp, %edi
+; X86-SSE2-NEXT: cmovnel %eax, %edx
+; X86-SSE2-NEXT: cmovnel %ebx, %eax
+; X86-SSE2-NEXT: cmovnel %ebp, %ebx
+; X86-SSE2-NEXT: cmovel %esi, %edi
+; X86-SSE2-NEXT: cmovel %ebp, %esi
+; X86-SSE2-NEXT: movl %esi, %ebp
+; X86-SSE2-NEXT: shldl %cl, %edi, %ebp
+; X86-SSE2-NEXT: movl %ebx, %edi
+; X86-SSE2-NEXT: shldl %cl, %esi, %edi
+; X86-SSE2-NEXT: movl %eax, %esi
+; X86-SSE2-NEXT: shldl %cl, %ebx, %esi
; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-SSE2-NEXT: shldl %cl, %esi, %edx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT: movl %edx, 12(%ecx)
-; X86-SSE2-NEXT: movl %edi, 8(%ecx)
-; X86-SSE2-NEXT: movl %ebx, 4(%ecx)
-; X86-SSE2-NEXT: movl %eax, (%ecx)
-; X86-SSE2-NEXT: movl %ecx, %eax
+; X86-SSE2-NEXT: shldl %cl, %eax, %edx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl %edx, 12(%eax)
+; X86-SSE2-NEXT: movl %esi, 8(%eax)
+; X86-SSE2-NEXT: movl %edi, 4(%eax)
+; X86-SSE2-NEXT: movl %ebp, (%eax)
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/h-registers-1.ll b/llvm/test/CodeGen/X86/h-registers-1.ll
index 07d85d260a37a7..ca6fba69ebec28 100644
--- a/llvm/test/CodeGen/X86/h-registers-1.ll
+++ b/llvm/test/CodeGen/X86/h-registers-1.ll
@@ -16,23 +16,26 @@ define i64 @foo(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h)
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: .cfi_offset %rbx, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: movq %rdi, %rbx
-; CHECK-NEXT: movzbl %bh, %esi
-; CHECK-NEXT: movzbl %ah, %edi
-; CHECK-NEXT: movzbl %dh, %edx
-; CHECK-NEXT: movzbl %ch, %ebp
-; CHECK-NEXT: movq %r8, %rax
-; CHECK-NEXT: movzbl %ah, %ecx
-; CHECK-NEXT: movq %r9, %rax
-; CHECK-NEXT: movzbl %ah, %ebx
+; CHECK-NEXT: movq %rcx, %r11
+; CHECK-NEXT: movq %rdx, %rbx
+; CHECK-NEXT: movq %rsi, %rdx
+; CHECK-NEXT: movq %rdi, %rcx
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
-; CHECK-NEXT: addq %rdi, %rsi
-; CHECK-NEXT: addq %rbp, %rdx
+; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; CHECK-NEXT: movzbl %ch, %edi
+; CHECK-NEXT: movzbl %dh, %ebp
+; CHECK-NEXT: movzbl %bh, %edx
+; CHECK-NEXT: movq %r11, %rcx
+; CHECK-NEXT: movzbl %ch, %esi
+; CHECK-NEXT: movq %r8, %rcx
+; CHECK-NEXT: movzbl %ch, %ecx
+; CHECK-NEXT: movq %r9, %rbx
+; CHECK-NEXT: movzbl %bh, %ebx
+; CHECK-NEXT: addq %rbp, %rdi
; CHECK-NEXT: addq %rsi, %rdx
+; CHECK-NEXT: addq %rdi, %rdx
; CHECK-NEXT: addq %rbx, %rcx
-; CHECK-NEXT: addq %r8, %rax
+; CHECK-NEXT: addq %r10, %rax
; CHECK-NEXT: addq %rcx, %rax
; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: popq %rbx
@@ -49,23 +52,26 @@ define i64 @foo(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h)
; GNUX32-NEXT: .cfi_def_cfa_offset 24
; GNUX32-NEXT: .cfi_offset %rbx, -24
; GNUX32-NEXT: .cfi_offset %rbp, -16
-; GNUX32-NEXT: movq %rsi, %rax
-; GNUX32-NEXT: movq %rdi, %rbx
-; GNUX32-NEXT: movzbl %bh, %esi
-; GNUX32-NEXT: movzbl %ah, %edi
-; GNUX32-NEXT: movzbl %dh, %edx
-; GNUX32-NEXT: movzbl %ch, %ebp
-; GNUX32-NEXT: movq %r8, %rax
-; GNUX32-NEXT: movzbl %ah, %ecx
-; GNUX32-NEXT: movq %r9, %rax
-; GNUX32-NEXT: movzbl %ah, %ebx
+; GNUX32-NEXT: movq %rcx, %r11
+; GNUX32-NEXT: movq %rdx, %rbx
+; GNUX32-NEXT: movq %rsi, %rdx
+; GNUX32-NEXT: movq %rdi, %rcx
; GNUX32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; GNUX32-NEXT: movzbl {{[0-9]+}}(%esp), %r8d
-; GNUX32-NEXT: addq %rdi, %rsi
-; GNUX32-NEXT: addq %rbp, %rdx
+; GNUX32-NEXT: movzbl {{[0-9]+}}(%esp), %r10d
+; GNUX32-NEXT: movzbl %ch, %edi
+; GNUX32-NEXT: movzbl %dh, %ebp
+; GNUX32-NEXT: movzbl %bh, %edx
+; GNUX32-NEXT: movq %r11, %rcx
+; GNUX32-NEXT: movzbl %ch, %esi
+; GNUX32-NEXT: movq %r8, %rcx
+; GNUX32-NEXT: movzbl %ch, %ecx
+; GNUX32-NEXT: movq %r9, %rbx
+; GNUX32-NEXT: movzbl %bh, %ebx
+; GNUX32-NEXT: addq %rbp, %rdi
; GNUX32-NEXT: addq %rsi, %rdx
+; GNUX32-NEXT: addq %rdi, %rdx
; GNUX32-NEXT: addq %rbx, %rcx
-; GNUX32-NEXT: addq %r8, %rax
+; GNUX32-NEXT: addq %r10, %rax
; GNUX32-NEXT: addq %rcx, %rax
; GNUX32-NEXT: addq %rdx, %rax
; GNUX32-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/isel-sdiv.ll b/llvm/test/CodeGen/X86/isel-sdiv.ll
index 6a6b2da8dc2f8d..6b47da9ce1b961 100644
--- a/llvm/test/CodeGen/X86/isel-sdiv.ll
+++ b/llvm/test/CodeGen/X86/isel-sdiv.ll
@@ -22,8 +22,8 @@ define i8 @test_sdiv_i8(i8 %arg1, i8 %arg2) nounwind {
; GISEL-X86-LABEL: test_sdiv_i8:
; GISEL-X86: # %bb.0:
; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; GISEL-X86-NEXT: cbtw
; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT: cbtw
; GISEL-X86-NEXT: idivb %cl
; GISEL-X86-NEXT: retl
%ret = sdiv i8 %arg1, %arg2
diff --git a/llvm/test/CodeGen/X86/isel-srem.ll b/llvm/test/CodeGen/X86/isel-srem.ll
index 56716e10a9d996..4d43981da3e512 100644
--- a/llvm/test/CodeGen/X86/isel-srem.ll
+++ b/llvm/test/CodeGen/X86/isel-srem.ll
@@ -49,8 +49,8 @@ define i8 @test_srem_i8(i8 %arg1, i8 %arg2) nounwind {
; GISEL-X86-LABEL: test_srem_i8:
; GISEL-X86: # %bb.0:
; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; GISEL-X86-NEXT: cbtw
; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT: cbtw
; GISEL-X86-NEXT: idivb %cl
; GISEL-X86-NEXT: movb %ah, %al
; GISEL-X86-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/isel-udiv.ll b/llvm/test/CodeGen/X86/isel-udiv.ll
index b56b8b112fe471..b30a8f12d82f89 100644
--- a/llvm/test/CodeGen/X86/isel-udiv.ll
+++ b/llvm/test/CodeGen/X86/isel-udiv.ll
@@ -22,8 +22,8 @@ define i8 @test_udiv_i8(i8 %arg1, i8 %arg2) nounwind {
; GISEL-X86-LABEL: test_udiv_i8:
; GISEL-X86: # %bb.0:
; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; GISEL-X86-NEXT: movzbl %al, %eax
; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT: movzbl %al, %eax
; GISEL-X86-NEXT: divb %cl
; GISEL-X86-NEXT: retl
%ret = udiv i8 %arg1, %arg2
diff --git a/llvm/test/CodeGen/X86/isel-urem.ll b/llvm/test/CodeGen/X86/isel-urem.ll
index 50b9c1250ff875..56e65008e10196 100644
--- a/llvm/test/CodeGen/X86/isel-urem.ll
+++ b/llvm/test/CodeGen/X86/isel-urem.ll
@@ -49,8 +49,8 @@ define i8 @test_urem_i8(i8 %arg1, i8 %arg2) nounwind {
; GISEL-X86-LABEL: test_urem_i8:
; GISEL-X86: # %bb.0:
; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; GISEL-X86-NEXT: movzbl %al, %eax
; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT: movzbl %al, %eax
; GISEL-X86-NEXT: divb %cl
; GISEL-X86-NEXT: movb %ah, %al
; GISEL-X86-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/legalize-shift-64.ll b/llvm/test/CodeGen/X86/legalize-shift-64.ll
index 53208de7ea27e8..c4e6b26657ba14 100644
--- a/llvm/test/CodeGen/X86/legalize-shift-64.ll
+++ b/llvm/test/CodeGen/X86/legalize-shift-64.ll
@@ -89,32 +89,32 @@ define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) {
; CHECK-NEXT: .cfi_offset %ebp, -8
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %ch
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
; CHECK-NEXT: movl %ebx, %edi
; CHECK-NEXT: shll %cl, %edi
; CHECK-NEXT: shldl %cl, %ebx, %esi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: testb $32, %cl
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx
; CHECK-NEXT: je .LBB4_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: movl %edi, %esi
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: .LBB4_2:
-; CHECK-NEXT: movl %edx, %ebx
+; CHECK-NEXT: movl %edx, %ebp
; CHECK-NEXT: movb %ch, %cl
-; CHECK-NEXT: shll %cl, %ebx
-; CHECK-NEXT: shldl %cl, %edx, %ebp
+; CHECK-NEXT: shll %cl, %ebp
+; CHECK-NEXT: shldl %cl, %edx, %ebx
; CHECK-NEXT: testb $32, %ch
; CHECK-NEXT: je .LBB4_4
; CHECK-NEXT: # %bb.3:
-; CHECK-NEXT: movl %ebx, %ebp
-; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: movl %ebp, %ebx
+; CHECK-NEXT: xorl %ebp, %ebp
; CHECK-NEXT: .LBB4_4:
-; CHECK-NEXT: movl %ebp, 12(%eax)
-; CHECK-NEXT: movl %ebx, 8(%eax)
+; CHECK-NEXT: movl %ebx, 12(%eax)
+; CHECK-NEXT: movl %ebp, 8(%eax)
; CHECK-NEXT: movl %esi, 4(%eax)
; CHECK-NEXT: movl %edi, (%eax)
; CHECK-NEXT: popl %esi
diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll
index 0299773aa67add..613d4e9ad0212c 100644
--- a/llvm/test/CodeGen/X86/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/mul-i1024.ll
@@ -15,63 +15,64 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl 60(%eax), %ebp
; X86-NEXT: movl 56(%eax), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%edx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %esi, %edi
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl 4(%ebx), %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: setb %cl
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ebx
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 48(%ecx), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 52(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%eax), %ebx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl 52(%ecx), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ecx, %ebp
-; X86-NEXT: setb %bl
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: setb %cl
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ebx
; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movzbl %bl, %ecx
+; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -82,12 +83,12 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl 8(%eax), %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, %ebp
@@ -95,24 +96,24 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl 12(%eax), %ecx
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: adcl %ebx, %esi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: adcl %ebx, %edi
; X86-NEXT: setb %bl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: addl %edi, %ebp
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: adcl $0, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
@@ -153,63 +154,63 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 40(%ecx), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 44(%eax), %ebp
+; X86-NEXT: movl 40(%eax), %ebx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: mull %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl 44(%ecx), %ebp
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %esi, %edi
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: setb %cl
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ebx
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 32(%ecx), %ebp
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 36(%eax), %ebp
+; X86-NEXT: movl 32(%eax), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl 36(%ecx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: addl %ebx, %esi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %ebx, %ebp
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ecx, %ebx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ebp
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: addl %ebx, %ecx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
@@ -223,26 +224,26 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: addl %edi, %ebp
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %esi
+; X86-NEXT: adcl %ebx, %edi
; X86-NEXT: setb %bl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: addl %edi, %ebp
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -364,39 +365,39 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl 24(%eax), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: movl 24(%eax), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl %ebx, %ebp
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl 28(%eax), %edi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%eax), %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: adcl %ecx, %ebx
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl %ebx, %ebp
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: adcl $0, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
@@ -463,26 +464,26 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %esi, %edi
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: setb %cl
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ebx
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
@@ -519,30 +520,30 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %esi
+; X86-NEXT: adcl %ebx, %edi
; X86-NEXT: setb %bl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: addl %edi, %ebp
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -603,15 +604,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 24(%ecx), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 28(%eax), %ebp
+; X86-NEXT: movl 24(%eax), %ebx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl 28(%ecx), %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: mull %edi
@@ -635,16 +636,17 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 16(%ecx), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 20(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%eax), %ebx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl 20(%ecx), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
@@ -738,15 +740,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 8(%ecx), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 12(%eax), %ebp
+; X86-NEXT: movl 8(%eax), %ebx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl 12(%ecx), %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: mull %esi
@@ -771,13 +773,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %ebx
+; X86-NEXT: movl 4(%ecx), %ebp
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl 4(%ecx), %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ecx
@@ -912,30 +914,30 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %esi, %ebp
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: mull %ecx
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X86-NEXT: adcl %ecx, %edx
@@ -1037,37 +1039,36 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: adcl (%esp), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %edi
; X86-NEXT: addl %esi, %edi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: mull %ebp
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: setb %cl
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %ebp, %ebx
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %ebx
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1076,20 +1077,20 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %ecx, %edi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: addl %edi, %ecx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
@@ -1097,38 +1098,38 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: mull %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl %ebx, %ebp
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: adcl %esi, %ebx
+; X86-NEXT: adcl %edi, %ebx
; X86-NEXT: setb (%esp) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ebx, %edi
; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -1136,30 +1137,30 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: addl %esi, %ebx
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ebp
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: addl %edi, %ebx
; X86-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
@@ -1228,35 +1229,36 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %esi
; X86-NEXT: movl %esi, %edi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: setb %cl
; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %edi, %ebp
; X86-NEXT: mull %edi
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %ebp, %ebx
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -2115,25 +2117,25 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %esi, %edi
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ecx, %ebx
; X86-NEXT: setb %cl
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %esi
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
@@ -2145,22 +2147,22 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ebx, %edi
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ecx, %ebp
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: addl %ebp, %ecx
@@ -2171,30 +2173,30 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: adcl %esi, %edi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: addl %edi, %ebp
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -2206,30 +2208,30 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: adcl %esi, %edi
; X86-NEXT: setb %bl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: addl %edi, %ecx
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
@@ -2326,24 +2328,25 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %edi
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edi, %ebp
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ecx, %ebx
; X86-NEXT: setb %cl
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl %ebx, %ebp
; X86-NEXT: movzbl %cl, %eax
@@ -3274,18 +3277,18 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 104(%ecx), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 108(%eax), %ebp
+; X86-NEXT: movl 104(%eax), %ebx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl 108(%ecx), %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %edi
; X86-NEXT: addl %esi, %edi
@@ -3305,162 +3308,162 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl 96(%esi), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 100(%eax), %ebp
+; X86-NEXT: movl 96(%eax), %ebx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl 100(%esi), %ebp
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %esi, %edi
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebx
-; X86-NEXT: setb %cl
+; X86-NEXT: adcl %ecx, %esi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: adcl %esi, %edi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %edi, %ecx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %edi, %esi
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %ebx
-; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: mull %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %edi
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: mull %ebp
; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl 112(%esi), %edi
-; X86-NEXT: movl %ebp, %ecx
-; X86-NEXT: imull %edi, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl 112(%ecx), %edi
+; X86-NEXT: movl %ebp, %esi
+; X86-NEXT: imull %edi, %esi
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl 116(%esi), %eax
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: movl 116(%ecx), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: imull %eax, %ebx
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: movl 120(%esi), %eax
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: imull %eax, %ebp
+; X86-NEXT: addl %edx, %ebp
+; X86-NEXT: movl 120(%ecx), %eax
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: imull %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: mull %ebp
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 124(%ecx), %ecx
-; X86-NEXT: imull %ebp, %ecx
-; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: imull %esi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: addl %ebx, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl 124(%ebx), %ebx
+; X86-NEXT: imull %ecx, %ebx
+; X86-NEXT: addl %edx, %ebx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %ecx
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: adcl %ebp, %ebx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
+; X86-NEXT: addl %ebp, %edi
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %ebp
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: adcl %ecx, %ebp
+; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %esi
; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: movzbl %cl, %ecx
+; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: adcl %ebx, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
@@ -3557,99 +3560,101 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 88(%ecx), %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl 92(%ecx), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 92(%eax), %edi
+; X86-NEXT: movl 88(%eax), %ebp
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: mull %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %esi, %ebx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebx
-; X86-NEXT: setb %cl
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %ecx, %esi
+; X86-NEXT: setb %cl
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 80(%ecx), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 84(%eax), %edi
+; X86-NEXT: movl 80(%eax), %ebx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl 84(%ecx), %ebp
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %esi, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ebp, %esi
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %esi
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %ecx, %ebp
+; X86-NEXT: setb %cl
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movzbl %cl, %ecx
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edi, %ebp
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %edi
+; X86-NEXT: adcl %edi, %ebx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edi, %ebp
+; X86-NEXT: addl %ebx, %ebp
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, %ebp
@@ -3658,97 +3663,98 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl %esi, %edi
+; X86-NEXT: adcl %edi, %esi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl %ebp, (%esp) # 4-byte Folded Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 72(%ecx), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 76(%eax), %edi
+; X86-NEXT: movl 72(%eax), %ebp
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl 76(%ecx), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: mull %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %esi, %ebx
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebx
+; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: setb %cl
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ebp
-; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl 64(%esi), %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 68(%eax), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 64(%eax), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl 68(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebp, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ebx, %esi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: mull %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %ebp, %edi
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ecx, %ebp
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: addl %ebp, %ecx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
@@ -3758,30 +3764,30 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: mull %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %edi
+; X86-NEXT: adcl %edi, %ebx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edi, %ebp
+; X86-NEXT: addl %ebx, %ebp
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -3793,47 +3799,46 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: addl %esi, %ebx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: adcl %eax, %esi
+; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl %ebp, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %edi
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
+; X86-NEXT: adcl %edi, %eax
; X86-NEXT: adcl $0, %esi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
@@ -3842,60 +3847,60 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %esi, %ebp
-; X86-NEXT: setb %cl
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %ecx
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %esi, %ebx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %ebp, %esi
+; X86-NEXT: mull %ecx
; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %ecx
-; X86-NEXT: setb %bl
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -3904,39 +3909,40 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %edi, %ebx
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebx
+; X86-NEXT: adcl %ecx, %edi
; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %edi, %ebx
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl $0, %ebx
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3947,21 +3953,21 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: adcl %esi, %ebx
+; X86-NEXT: adcl %esi, %edi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
; X86-NEXT: adcl %esi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: addl %ebx, %esi
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X86-NEXT: adcl %ecx, %eax
@@ -3988,68 +3994,68 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %esi, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ebp, %ebx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %esi
+; X86-NEXT: adcl %ecx, %edi
; X86-NEXT: setb %cl
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %esi, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %edi, %ebx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %esi
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: setb %bl
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: addl %edi, %ecx
+; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4057,34 +4063,34 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %edi
+; X86-NEXT: adcl %esi, %ebx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ebx, %edi
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: addl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT: addl (%esp), %edi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: setb (%esp) # 1-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4095,25 +4101,25 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: adcl %esi, %edi
+; X86-NEXT: adcl %esi, %ebx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: addl %edi, %ecx
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %edi
+; X86-NEXT: adcl %eax, %ebx
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -4128,8 +4134,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -4274,7 +4280,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: imull %eax, %ecx
; X86-NEXT: movl (%esp), %esi # 4-byte Reload
; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: movl %esi, %eax
; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
@@ -4282,37 +4288,36 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: imull %ebx, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: imull %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl %esi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: imull %edi, %esi
+; X86-NEXT: imull %ecx, %esi
; X86-NEXT: addl %edx, %esi
-; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ebx, %ebp
; X86-NEXT: adcl (%esp), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl %edi, %ebp
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %ecx, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %edi
; X86-NEXT: addl %esi, %edi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4320,65 +4325,65 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl 120(%ebx), %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl 120(%edi), %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: addl %edx, %esi
-; X86-NEXT: movl 124(%ebx), %eax
-; X86-NEXT: imull %ecx, %eax
+; X86-NEXT: movl 124(%edi), %eax
+; X86-NEXT: imull %ebp, %eax
; X86-NEXT: addl %eax, %esi
-; X86-NEXT: movl 112(%ebx), %edi
-; X86-NEXT: movl 116(%ebx), %ebp
+; X86-NEXT: movl 112(%edi), %ecx
+; X86-NEXT: movl 116(%edi), %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: imull %ebp, %ebx
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: mull %edi
+; X86-NEXT: imull %edi, %ebx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: mull %ecx
; X86-NEXT: addl %ebx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: imull %edi, %ecx
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: imull %ecx, %ebx
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: adcl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %ebp, %esi
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ebp, %ebx
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebp, %edi
+; X86-NEXT: addl %ebx, %edi
; X86-NEXT: adcl %esi, %ecx
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: setb %bl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %ebp
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movzbl %bl, %ecx
; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: adcl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
@@ -4395,202 +4400,200 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ebp, %esi
+; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: adcl %ecx, %ebp
; X86-NEXT: setb %cl
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ebx
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %edi, %esi
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edi, %ecx
-; X86-NEXT: setb %bl
+; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl %ebp, %edi
-; X86-NEXT: mull %esi
+; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: addl %edi, %ebp
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebx
+; X86-NEXT: adcl %edi, %esi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %esi, %ebx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: adcl %eax, %edi
; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: adcl %edi, %ebx
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %ecx, %esi
+; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl %ecx, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %edi, %esi
-; X86-NEXT: imull %eax, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: imull %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: addl %edx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: imull %ebx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %esi, %edx
+; X86-NEXT: imull %ebx, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: imull %edi, %esi
-; X86-NEXT: addl %edx, %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: imull %esi, %edi
+; X86-NEXT: addl %edx, %edi
; X86-NEXT: addl %ebp, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl %edi, %ebp
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %esi, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebx
-; X86-NEXT: setb %cl
+; X86-NEXT: adcl %esi, %edi
+; X86-NEXT: setb %bl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %edi
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movzbl %cl, %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movzbl %bl, %ecx
; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: imull %esi, %ecx
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: imull %eax, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: addl %esi, %edx
; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: addl %edx, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: imull %edi, %ecx
+; X86-NEXT: imull %edi, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: mull %ebp
-; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: addl %esi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: imull %ebp, %ecx
; X86-NEXT: addl %edx, %ecx
@@ -4599,14 +4602,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: adcl %ebx, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: addl %ebx, %esi
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
@@ -4829,51 +4833,52 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq 40(%rdi), %rbx
; X64-NEXT: movq 32(%rdi), %r12
-; X64-NEXT: movq 56(%rdi), %r15
+; X64-NEXT: movq 56(%rdi), %r14
; X64-NEXT: movq 48(%rdi), %r10
-; X64-NEXT: movq (%rsi), %r11
-; X64-NEXT: movq 8(%rsi), %r14
+; X64-NEXT: movq (%rsi), %r15
+; X64-NEXT: movq 8(%rsi), %r11
; X64-NEXT: movq %rsi, %r13
; X64-NEXT: movq %r10, %rax
; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r11
+; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r11
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %r9
; X64-NEXT: addq %rcx, %r9
; X64-NEXT: adcq $0, %rsi
; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r14
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, %r8
; X64-NEXT: addq %r9, %r8
; X64-NEXT: adcq %rsi, %r10
; X64-NEXT: setb %al
; X64-NEXT: movzbl %al, %r9d
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %r11
+; X64-NEXT: movq %r11, %rcx
+; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rax, %rsi
; X64-NEXT: addq %r10, %rsi
-; X64-NEXT: adcq %r9, %rcx
+; X64-NEXT: adcq %r9, %r14
; X64-NEXT: movq %r12, %rax
-; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r11
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r11
+; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, %r11
; X64-NEXT: addq %r9, %r11
; X64-NEXT: adcq $0, %r10
; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %r14
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: addq %r11, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -4881,7 +4886,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: setb %r10b
; X64-NEXT: movq %rbx, %r11
; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r14
+; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, %r15
; X64-NEXT: addq %r9, %r15
@@ -4890,7 +4895,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: addq %rdi, %r15
; X64-NEXT: adcq %r8, %rbx
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: adcq $0, %r14
; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq 16(%r13), %r8
; X64-NEXT: movq %r12, %r10
@@ -4898,7 +4903,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: movq %r12, %rax
; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r14
+; X64-NEXT: movq %rax, %rcx
; X64-NEXT: movq %r11, %rax
; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: mulq %r8
@@ -4921,29 +4926,29 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: addq %r13, %r9
; X64-NEXT: movzbl %r10b, %eax
; X64-NEXT: adcq %rax, %rdi
-; X64-NEXT: addq %r15, %r14
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: addq %r15, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq %rbx, %r12
; X64-NEXT: movq %r12, (%rsp) # 8-byte Spill
; X64-NEXT: adcq $0, %r9
; X64-NEXT: adcq $0, %rdi
; X64-NEXT: addq %rsi, %r9
-; X64-NEXT: adcq %rcx, %rdi
+; X64-NEXT: adcq %r14, %rdi
; X64-NEXT: setb %r10b
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, %r11
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: movq %r15, %rax
; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %rbx
; X64-NEXT: addq %rcx, %rbx
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rdx, %rcx
@@ -4951,7 +4956,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: movq %rax, %rbx
; X64-NEXT: adcq %rsi, %rcx
; X64-NEXT: setb %sil
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %r15, %rax
; X64-NEXT: mulq %rbp
; X64-NEXT: addq %rcx, %rax
; X64-NEXT: movq %rax, %rcx
@@ -4966,17 +4971,17 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: movq 16(%r14), %r11
-; X64-NEXT: movq %r11, %rax
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rax, %r9
+; X64-NEXT: movq 24(%r10), %r8
+; X64-NEXT: movq 16(%r10), %r11
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: mulq %r14
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq 24(%r14), %r8
+; X64-NEXT: movq %rax, %r9
; X64-NEXT: movq %r8, %rax
; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r10
+; X64-NEXT: mulq %r14
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %rdi
; X64-NEXT: addq %rcx, %rdi
@@ -4997,14 +5002,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: addq %r15, %rdi
; X64-NEXT: movzbl %sil, %eax
; X64-NEXT: adcq %rax, %rcx
-; X64-NEXT: movq (%r14), %rbp
+; X64-NEXT: movq (%r10), %rbp
+; X64-NEXT: movq 8(%r10), %r10
; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r14
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq 8(%r14), %r14
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r10
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq %r10, %r8
+; X64-NEXT: mulq %r14
; X64-NEXT: movq %rdx, %r15
; X64-NEXT: movq %rax, %r12
; X64-NEXT: addq %rsi, %r12
@@ -5016,8 +5022,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq %r15, %rsi
; X64-NEXT: setb %r10b
-; X64-NEXT: movq %r14, %r15
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %r8, %r15
+; X64-NEXT: movq %r8, %rax
; X64-NEXT: mulq %r13
; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rax, %r13
@@ -5030,13 +5036,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: adcq $0, %rcx
; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: mulq %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %r10
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r8
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %r9, %r8
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movq %rax, %rbx
; X64-NEXT: addq %rsi, %rbx
@@ -5137,20 +5144,20 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: addq %r15, %r11
; X64-NEXT: movzbl %r10b, %eax
; X64-NEXT: adcq %rax, %rdi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %r15
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: movq %r9, %rax
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %r13
; X64-NEXT: movq %rax, %rbp
; X64-NEXT: addq %r15, %rbp
; X64-NEXT: adcq $0, %r13
-; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: mulq %rbx
@@ -5159,7 +5166,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
; X64-NEXT: adcq %r13, %r10
; X64-NEXT: setb %bl
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %r9, %rax
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %r15
; X64-NEXT: movq %rax, %rbp
@@ -5171,19 +5178,19 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: adcq $0, %r11
; X64-NEXT: adcq $0, %rdi
; X64-NEXT: movq 48(%r8), %rcx
-; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq %r14, %r12
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq %r9, %r12
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, %r13
; X64-NEXT: addq %rsi, %r13
; X64-NEXT: adcq $0, %r10
; X64-NEXT: movq 56(%r8), %rsi
-; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rdx, %r14
@@ -5650,25 +5657,25 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: movq 88(%rcx), %r10
+; X64-NEXT: movq 80(%rcx), %r11
+; X64-NEXT: movq %rcx, %r14
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq 80(%rdi), %r10
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %rsi
; X64-NEXT: movq %r10, %rax
; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq 88(%rdi), %r15
-; X64-NEXT: movq %rdi, %r14
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %r8, %rbx
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdi, %rbx
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movq %rax, %r9
; X64-NEXT: addq %rcx, %r9
; X64-NEXT: adcq $0, %r8
-; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq %r11, %rax
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %rcx
@@ -5676,36 +5683,35 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: addq %r9, %rdi
; X64-NEXT: adcq %r8, %rcx
; X64-NEXT: setb %r8b
-; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %r11
-; X64-NEXT: movq %r11, %r10
+; X64-NEXT: movq %r11, %r13
; X64-NEXT: movq %rdx, %r15
; X64-NEXT: movq %rax, %r12
; X64-NEXT: addq %rcx, %r12
; X64-NEXT: movzbl %r8b, %eax
; X64-NEXT: adcq %rax, %r15
-; X64-NEXT: movq 64(%r14), %rcx
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq 72(%r14), %r10
+; X64-NEXT: movq 64(%r14), %r9
+; X64-NEXT: movq %r9, %rax
; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq 72(%r14), %r8
-; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, %r14
; X64-NEXT: addq %r11, %r14
; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %rcx, %r9
-; X64-NEXT: mulq %r10
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %r13
; X64-NEXT: movq %rdx, %r11
; X64-NEXT: addq %r14, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq %rbx, %r11
; X64-NEXT: setb %cl
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r10
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %r13
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, %rbp
; X64-NEXT: addq %r11, %rbp
@@ -5722,9 +5728,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: mulq %r14
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %r9
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq %r8, %r10
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: mulq %r14
; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, %r11
@@ -5800,40 +5805,39 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: movq %r14, %rax
; X64-NEXT: imulq %r9, %rax
; X64-NEXT: addq %rdx, %rax
-; X64-NEXT: movq %rax, %r11
+; X64-NEXT: movq %rax, %rbx
; X64-NEXT: movq 112(%rcx), %rax
-; X64-NEXT: movq %rcx, %r14
+; X64-NEXT: movq %rcx, %r11
; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: imulq %r14, %rcx
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT: imulq %r10, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT: mulq %rbx
+; X64-NEXT: mulq %r10
; X64-NEXT: movq %rax, %r8
; X64-NEXT: addq %rcx, %rdx
-; X64-NEXT: movq 120(%r14), %r13
-; X64-NEXT: imulq %rbx, %r13
-; X64-NEXT: addq %rdx, %r13
+; X64-NEXT: movq 120(%r11), %r11
+; X64-NEXT: imulq %r10, %r11
+; X64-NEXT: addq %rdx, %r11
; X64-NEXT: addq %rdi, %r8
-; X64-NEXT: adcq %r11, %r13
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: adcq %rbx, %r11
+; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rdi, %rbx
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %rdi, %r13
+; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rbx, %r12
-; X64-NEXT: adcq %r11, %rcx
+; X64-NEXT: addq %r13, %r12
+; X64-NEXT: adcq %rbx, %rcx
; X64-NEXT: setb %sil
-; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, %r9
@@ -5841,7 +5845,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: movzbl %sil, %eax
; X64-NEXT: adcq %rax, %rbx
; X64-NEXT: addq %r8, %r9
-; X64-NEXT: adcq %r13, %rbx
+; X64-NEXT: adcq %r11, %rbx
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
; X64-NEXT: imulq %r10, %rdi
@@ -5854,23 +5858,23 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
; X64-NEXT: imulq %r14, %rax
; X64-NEXT: addq %rdx, %rax
-; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, %r11
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT: movq %rax, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
; X64-NEXT: imulq %r8, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rax, %r11
+; X64-NEXT: movq %rax, %r13
; X64-NEXT: addq %rsi, %rdx
; X64-NEXT: imulq %rdi, %rbp
; X64-NEXT: addq %rdx, %rbp
-; X64-NEXT: addq %rcx, %r11
-; X64-NEXT: adcq %r13, %rbp
+; X64-NEXT: addq %rcx, %r13
+; X64-NEXT: adcq %r11, %rbp
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r10
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, %r11
; X64-NEXT: movq %r8, %rax
; X64-NEXT: movq %r8, %r15
; X64-NEXT: mulq %r10
@@ -5890,13 +5894,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: addq %rdi, %rax
; X64-NEXT: movzbl %sil, %esi
; X64-NEXT: adcq %rsi, %rdx
-; X64-NEXT: addq %r11, %rax
+; X64-NEXT: addq %r13, %rax
; X64-NEXT: adcq %rbp, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
; X64-NEXT: adcq %r12, %rcx
; X64-NEXT: adcq %r9, %rax
; X64-NEXT: adcq %rbx, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
@@ -5906,9 +5910,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
@@ -5917,8 +5921,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
; X64-NEXT: movq %rdi, %r9
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
@@ -5942,8 +5946,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: movq %r8, 64(%rsi)
; X64-NEXT: movq %r9, 72(%rsi)
; X64-NEXT: movq %r10, 80(%rsi)
-; X64-NEXT: movq %r11, 88(%rsi)
-; X64-NEXT: movq %r13, 96(%rsi)
+; X64-NEXT: movq %rbx, 88(%rsi)
+; X64-NEXT: movq %r11, 96(%rsi)
; X64-NEXT: movq %rcx, 104(%rsi)
; X64-NEXT: movq %rax, 112(%rsi)
; X64-NEXT: movq %rdx, 120(%rsi)
diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll
index 2d7737bfdd3c2e..f5593f3316e63e 100644
--- a/llvm/test/CodeGen/X86/mul-i256.ll
+++ b/llvm/test/CodeGen/X86/mul-i256.ll
@@ -48,17 +48,17 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl (%esi), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %ebx
+; X86-NEXT: movl 4(%ecx), %ebp
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl 4(%esi), %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: addl %edi, %esi
diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll
index 64f6746e616ede..82961e7d627ddc 100644
--- a/llvm/test/CodeGen/X86/mul-i512.ll
+++ b/llvm/test/CodeGen/X86/mul-i512.ll
@@ -45,16 +45,17 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 16(%ecx), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 20(%eax), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%eax), %ebx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl 20(%ecx), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
@@ -155,15 +156,15 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 8(%ecx), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 12(%eax), %ebx
+; X86-NEXT: movl 8(%eax), %ebp
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl 12(%ecx), %ebx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: mull %esi
@@ -186,16 +187,17 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl (%esi), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %ebx
+; X86-NEXT: movl 4(%ecx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl 4(%esi), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
@@ -864,18 +866,18 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 40(%ecx), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 44(%eax), %ebp
+; X86-NEXT: movl 40(%eax), %ebx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl 44(%ecx), %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %edi
; X86-NEXT: addl %esi, %edi
@@ -895,17 +897,17 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl 32(%esi), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 36(%eax), %ebp
+; X86-NEXT: movl 32(%eax), %ebx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl 36(%esi), %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: addl %edi, %esi
diff --git a/llvm/test/CodeGen/X86/musttail-varargs.ll b/llvm/test/CodeGen/X86/musttail-varargs.ll
index d3ded0b2a03d87..22fa09a83173f9 100644
--- a/llvm/test/CodeGen/X86/musttail-varargs.ll
+++ b/llvm/test/CodeGen/X86/musttail-varargs.ll
@@ -37,7 +37,6 @@ define void @f_thunk(ptr %this, ...) {
; LINUX-NEXT: .cfi_offset %r14, -32
; LINUX-NEXT: .cfi_offset %r15, -24
; LINUX-NEXT: .cfi_offset %rbp, -16
-; LINUX-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; LINUX-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; LINUX-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; LINUX-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -46,6 +45,7 @@ define void @f_thunk(ptr %this, ...) {
; LINUX-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; LINUX-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; LINUX-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; LINUX-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; LINUX-NEXT: movq %r9, %r14
; LINUX-NEXT: movq %r8, %r15
; LINUX-NEXT: movq %rcx, %r12
@@ -82,7 +82,6 @@ define void @f_thunk(ptr %this, ...) {
; LINUX-NEXT: movq %r13, %rdx
; LINUX-NEXT: movq %r12, %rcx
; LINUX-NEXT: movq %r15, %r8
-; LINUX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; LINUX-NEXT: movq %r14, %r9
; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -92,6 +91,7 @@ define void @f_thunk(ptr %this, ...) {
; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; LINUX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; LINUX-NEXT: addq $360, %rsp # imm = 0x168
; LINUX-NEXT: .cfi_def_cfa_offset 56
; LINUX-NEXT: popq %rbx
@@ -130,7 +130,6 @@ define void @f_thunk(ptr %this, ...) {
; LINUX-X32-NEXT: .cfi_offset %r14, -32
; LINUX-X32-NEXT: .cfi_offset %r15, -24
; LINUX-X32-NEXT: .cfi_offset %rbp, -16
-; LINUX-X32-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; LINUX-X32-NEXT: movaps %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; LINUX-X32-NEXT: movaps %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; LINUX-X32-NEXT: movaps %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
@@ -139,6 +138,7 @@ define void @f_thunk(ptr %this, ...) {
; LINUX-X32-NEXT: movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; LINUX-X32-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; LINUX-X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; LINUX-X32-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; LINUX-X32-NEXT: movq %r9, %r14
; LINUX-X32-NEXT: movq %r8, %r15
; LINUX-X32-NEXT: movq %rcx, %r12
@@ -175,7 +175,6 @@ define void @f_thunk(ptr %this, ...) {
; LINUX-X32-NEXT: movq %r13, %rdx
; LINUX-X32-NEXT: movq %r12, %rcx
; LINUX-X32-NEXT: movq %r15, %r8
-; LINUX-X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; LINUX-X32-NEXT: movq %r14, %r9
; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -185,6 +184,7 @@ define void @f_thunk(ptr %this, ...) {
; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload
; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 16-byte Reload
; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload
+; LINUX-X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; LINUX-X32-NEXT: addl $344, %esp # imm = 0x158
; LINUX-X32-NEXT: .cfi_def_cfa_offset 56
; LINUX-X32-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
index f3741dc202dc58..221be1e09f9899 100644
--- a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
+++ b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
@@ -271,25 +271,21 @@ f:
define i64 @test_two_live_flags(ptr %foo0, i64 %bar0, i64 %baz0, ptr %foo1, i64 %bar1, i64 %baz1) nounwind {
; CHECK32-LABEL: test_two_live_flags:
; CHECK32: # %bb.0: # %entry
-; CHECK32-NEXT: pushl %ebp
; CHECK32-NEXT: pushl %ebx
-; CHECK32-NEXT: pushl %edi
; CHECK32-NEXT: pushl %esi
; CHECK32-NEXT: pushl %eax
-; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi
; CHECK32-NEXT: lock cmpxchg8b (%esi)
-; CHECK32-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: movl %ebp, %edx
-; CHECK32-NEXT: movl %edi, %ecx
+; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; CHECK32-NEXT: lock cmpxchg8b (%esi)
; CHECK32-NEXT: sete %al
; CHECK32-NEXT: andb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
@@ -304,9 +300,7 @@ define i64 @test_two_live_flags(ptr %foo0, i64 %bar0, i64 %baz0, ptr %foo1, i64
; CHECK32-NEXT: xorl %edx, %edx
; CHECK32-NEXT: addl $4, %esp
; CHECK32-NEXT: popl %esi
-; CHECK32-NEXT: popl %edi
; CHECK32-NEXT: popl %ebx
-; CHECK32-NEXT: popl %ebp
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_two_live_flags:
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index ace78b38d53edb..29336ccc641540 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -22,32 +22,32 @@ define void @f() nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $160, %esp
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $176, %esp
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movzbl (%eax), %eax
; X86-NEXT: movzbl (%eax), %ecx
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: divb %cl
-; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: shll $30, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: sarl $30, %ecx
; X86-NEXT: sarl $31, %eax
; X86-NEXT: shrdl $1, %eax, %ecx
-; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: xorl %eax, %esi
; X86-NEXT: xorl %eax, %edi
-; X86-NEXT: xorl %ecx, %esi
-; X86-NEXT: subl %ecx, %esi
+; X86-NEXT: xorl %ecx, %edx
+; X86-NEXT: subl %ecx, %edx
; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: sbbl %eax, %edx
-; X86-NEXT: andl $3, %edx
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: andl $3, %esi
; X86-NEXT: testl %edi, %edi
; X86-NEXT: jne .LBB0_1
; X86-NEXT: # %bb.2: # %BB_udiv-special-cases
-; X86-NEXT: bsrl %esi, %eax
+; X86-NEXT: bsrl %edx, %eax
; X86-NEXT: xorl $31, %eax
; X86-NEXT: addl $32, %eax
; X86-NEXT: jmp .LBB0_3
@@ -56,14 +56,14 @@ define void @f() nounwind {
; X86-NEXT: xorl $31, %eax
; X86-NEXT: .LBB0_3: # %BB_udiv-special-cases
; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: testl %edx, %edx
+; X86-NEXT: testl %esi, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: jne .LBB0_4
; X86-NEXT: # %bb.5: # %BB_udiv-special-cases
; X86-NEXT: addl $64, %eax
; X86-NEXT: jmp .LBB0_6
; X86-NEXT: .LBB0_4:
-; X86-NEXT: bsrl %edx, %eax
+; X86-NEXT: bsrl %esi, %eax
; X86-NEXT: xorl $31, %eax
; X86-NEXT: addl $32, %eax
; X86-NEXT: .LBB0_6: # %BB_udiv-special-cases
@@ -92,9 +92,8 @@ define void @f() nounwind {
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: andl $3, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movb $65, %cl
; X86-NEXT: subb %al, %cl
; X86-NEXT: movb %cl, %ch
@@ -102,7 +101,7 @@ define void @f() nounwind {
; X86-NEXT: shrb $3, %cl
; X86-NEXT: andb $15, %cl
; X86-NEXT: negb %cl
-; X86-NEXT: movsbl %cl, %eax
+; X86-NEXT: movsbl %cl, %ebx
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -112,28 +111,29 @@ define void @f() nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 120(%esp,%eax), %edi
+; X86-NEXT: movl 136(%esp,%ebx), %edi
; X86-NEXT: movb %ch, %cl
; X86-NEXT: shll %cl, %edi
; X86-NEXT: notb %cl
-; X86-NEXT: movl 112(%esp,%eax), %esi
+; X86-NEXT: movl 132(%esp,%ebx), %eax
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 116(%esp,%eax), %edx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shrl %eax
-; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: shrl %edx
+; X86-NEXT: shrl %cl, %edx
+; X86-NEXT: movl 128(%esp,%ebx), %ebx
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %ebx, %ecx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: andl $3, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: je .LBB0_11
; X86-NEXT: # %bb.9: # %udiv-preheader
-; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %edx, %edi
; X86-NEXT: andl $3, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -151,13 +151,13 @@ define void @f() nounwind {
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $15, %al
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl 64(%esp,%eax), %edi
-; X86-NEXT: movl 68(%esp,%eax), %edx
+; X86-NEXT: movl 80(%esp,%eax), %edi
+; X86-NEXT: movl 84(%esp,%eax), %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movb %ch, %cl
; X86-NEXT: shrl %cl, %esi
; X86-NEXT: notb %cl
-; X86-NEXT: movl 72(%esp,%eax), %ebx
+; X86-NEXT: movl 88(%esp,%eax), %ebx
; X86-NEXT: addl %ebx, %ebx
; X86-NEXT: shll %cl, %ebx
; X86-NEXT: orl %esi, %ebx
@@ -180,30 +180,32 @@ define void @f() nounwind {
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB0_10: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ebx, %edi
; X86-NEXT: shldl $1, %ebx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, %edx
; X86-NEXT: andl $2, %edx
; X86-NEXT: shrl %edx
-; X86-NEXT: leal (%edx,%ebx,2), %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: leal (%edx,%edi,2), %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: shldl $1, %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %eax
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %edx, %edx
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %eax, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: andl $3, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: sbbl %edi, %edx
+; X86-NEXT: sbbl %ebx, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: sbbl %ecx, %esi
; X86-NEXT: shll $30, %esi
@@ -218,10 +220,9 @@ define void @f() nounwind {
; X86-NEXT: movl %esi, %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: subl %edx, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %edi
-; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: subl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %esi, %ebx
; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: andl $3, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll
index ce0b212aa4c26c..06f15aeb3b5a47 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix.ll
@@ -439,93 +439,89 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-LABEL: vec:
; X64: # %bb.0:
; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-NEXT: pxor %xmm3, %xmm3
+; X64-NEXT: pcmpgtd %xmm1, %xmm3
; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; X64-NEXT: movdqa %xmm1, %xmm3
-; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X64-NEXT: movq %xmm3, %rcx
-; X64-NEXT: pxor %xmm5, %xmm5
-; X64-NEXT: pcmpgtd %xmm0, %xmm5
+; X64-NEXT: movdqa %xmm1, %xmm5
+; X64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; X64-NEXT: movq %xmm5, %rcx
+; X64-NEXT: pxor %xmm6, %xmm6
+; X64-NEXT: pcmpgtd %xmm0, %xmm6
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; X64-NEXT: psllq $31, %xmm0
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: cqto
; X64-NEXT: idivq %rcx
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; X64-NEXT: movq %xmm3, %rcx
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X64-NEXT: movq %xmm3, %rax
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq %rdx, %xmm6
+; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; X64-NEXT: movq %xmm5, %rsi
+; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
+; X64-NEXT: movq %xmm5, %rax
; X64-NEXT: cqto
-; X64-NEXT: idivq %rcx
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: idivq %rsi
+; X64-NEXT: movq %rdx, %xmm5
+; X64-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
+; X64-NEXT: pcmpeqd %xmm2, %xmm6
+; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,0,3,2]
+; X64-NEXT: pand %xmm6, %xmm5
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
+; X64-NEXT: pxor %xmm6, %xmm6
+; X64-NEXT: pcmpgtd %xmm3, %xmm6
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-NEXT: pxor %xmm3, %xmm3
-; X64-NEXT: pcmpgtd %xmm4, %xmm3
-; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; X64-NEXT: movq %xmm4, %rcx
+; X64-NEXT: pcmpgtd %xmm0, %xmm3
+; X64-NEXT: pxor %xmm6, %xmm3
+; X64-NEXT: pandn %xmm3, %xmm5
+; X64-NEXT: movq %rcx, %xmm0
+; X64-NEXT: movq %rax, %xmm3
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; X64-NEXT: movdqa %xmm5, %xmm6
+; X64-NEXT: pandn %xmm0, %xmm6
+; X64-NEXT: pcmpeqd %xmm3, %xmm3
+; X64-NEXT: paddq %xmm3, %xmm0
+; X64-NEXT: pand %xmm5, %xmm0
+; X64-NEXT: por %xmm6, %xmm0
; X64-NEXT: pxor %xmm5, %xmm5
-; X64-NEXT: pcmpgtd %xmm1, %xmm5
-; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; X64-NEXT: pcmpgtd %xmm4, %xmm5
+; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; X64-NEXT: movq %xmm4, %rcx
+; X64-NEXT: pxor %xmm6, %xmm6
+; X64-NEXT: pcmpgtd %xmm1, %xmm6
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; X64-NEXT: psllq $31, %xmm1
; X64-NEXT: movq %xmm1, %rax
; X64-NEXT: cqto
; X64-NEXT: idivq %rcx
; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rdx, %xmm6
; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; X64-NEXT: movq %xmm4, %r11
+; X64-NEXT: movq %xmm4, %rsi
; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; X64-NEXT: movq %xmm4, %rax
; X64-NEXT: cqto
-; X64-NEXT: idivq %r11
-; X64-NEXT: movq %r8, %xmm5
-; X64-NEXT: movq %r10, %xmm6
-; X64-NEXT: pxor %xmm4, %xmm4
-; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
-; X64-NEXT: pcmpeqd %xmm4, %xmm5
-; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2]
-; X64-NEXT: pand %xmm5, %xmm6
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; X64-NEXT: pxor %xmm5, %xmm5
-; X64-NEXT: pcmpgtd %xmm2, %xmm5
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: pcmpgtd %xmm0, %xmm2
-; X64-NEXT: movq %rsi, %xmm0
-; X64-NEXT: pxor %xmm5, %xmm2
-; X64-NEXT: movq %rdi, %xmm5
-; X64-NEXT: pandn %xmm2, %xmm6
-; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; X64-NEXT: movdqa %xmm6, %xmm5
-; X64-NEXT: pandn %xmm0, %xmm5
-; X64-NEXT: pcmpeqd %xmm2, %xmm2
-; X64-NEXT: paddq %xmm2, %xmm0
-; X64-NEXT: pand %xmm6, %xmm0
-; X64-NEXT: por %xmm5, %xmm0
-; X64-NEXT: movq %r9, %xmm5
-; X64-NEXT: movq %rdx, %xmm6
-; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
-; X64-NEXT: pcmpeqd %xmm4, %xmm5
-; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2]
-; X64-NEXT: pand %xmm5, %xmm6
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
-; X64-NEXT: pxor %xmm5, %xmm5
-; X64-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-NEXT: idivq %rsi
+; X64-NEXT: movq %rdx, %xmm4
+; X64-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm4[0]
+; X64-NEXT: pcmpeqd %xmm2, %xmm6
+; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,0,3,2]
+; X64-NEXT: pand %xmm6, %xmm4
+; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,1,1]
+; X64-NEXT: pxor %xmm6, %xmm6
+; X64-NEXT: pcmpgtd %xmm5, %xmm6
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X64-NEXT: pcmpgtd %xmm1, %xmm4
-; X64-NEXT: pxor %xmm5, %xmm4
-; X64-NEXT: pandn %xmm4, %xmm6
+; X64-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-NEXT: pxor %xmm6, %xmm2
+; X64-NEXT: pandn %xmm2, %xmm4
; X64-NEXT: movq %rcx, %xmm1
-; X64-NEXT: movq %rax, %xmm3
-; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; X64-NEXT: movdqa %xmm6, %xmm3
-; X64-NEXT: pandn %xmm1, %xmm3
-; X64-NEXT: paddq %xmm2, %xmm1
-; X64-NEXT: pand %xmm6, %xmm1
-; X64-NEXT: por %xmm3, %xmm1
+; X64-NEXT: movq %rax, %xmm2
+; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-NEXT: movdqa %xmm4, %xmm2
+; X64-NEXT: pandn %xmm1, %xmm2
+; X64-NEXT: paddq %xmm3, %xmm1
+; X64-NEXT: pand %xmm4, %xmm1
+; X64-NEXT: por %xmm2, %xmm1
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X64-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 4fbe05cd1b2f2f..279743e549b121 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -13,16 +13,16 @@ define void @test_lshr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $32, %esp
+; i686-NEXT: subl $40, %esp
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %eax, (%esp)
+; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -31,28 +31,31 @@ define void @test_lshr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-NEXT: andb $7, %al
; i686-NEXT: shrb $3, %cl
; i686-NEXT: andb $15, %cl
-; i686-NEXT: movzbl %cl, %ebp
-; i686-NEXT: movl 4(%esp,%ebp), %edx
-; i686-NEXT: movl %edx, %esi
+; i686-NEXT: movzbl %cl, %edx
+; i686-NEXT: movl 8(%esp,%edx), %ecx
+; i686-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; i686-NEXT: movl 12(%esp,%edx), %ebp
+; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrl %cl, %esi
+; i686-NEXT: shrl %cl, %ebp
; i686-NEXT: notb %cl
-; i686-NEXT: movl 8(%esp,%ebp), %ebx
-; i686-NEXT: leal (%ebx,%ebx), %edi
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: orl %esi, %edi
-; i686-NEXT: movl (%esp,%ebp), %esi
-; i686-NEXT: movl 12(%esp,%ebp), %ebp
+; i686-NEXT: movl 16(%esp,%edx), %ebx
+; i686-NEXT: leal (%ebx,%ebx), %esi
+; i686-NEXT: shll %cl, %esi
+; i686-NEXT: movl 20(%esp,%edx), %edx
; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrdl %cl, %ebp, %ebx
-; i686-NEXT: shrdl %cl, %edx, %esi
-; i686-NEXT: shrl %cl, %ebp
+; i686-NEXT: shrdl %cl, %edx, %ebx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; i686-NEXT: shrdl %cl, %edi, (%esp) # 4-byte Folded Spill
+; i686-NEXT: orl %ebp, %esi
+; i686-NEXT: shrl %cl, %edx
; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl %ebp, 12(%eax)
+; i686-NEXT: movl %edx, 12(%eax)
; i686-NEXT: movl %ebx, 8(%eax)
-; i686-NEXT: movl %esi, (%eax)
-; i686-NEXT: movl %edi, 4(%eax)
-; i686-NEXT: addl $32, %esp
+; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, (%eax)
+; i686-NEXT: movl %esi, 4(%eax)
+; i686-NEXT: addl $40, %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -84,47 +87,50 @@ define void @test_ashr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $32, %esp
+; i686-NEXT: subl $40, %esp
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %eax, (%esp)
-; i686-NEXT: sarl $31, %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT: sarl $31, %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ecx, %eax
; i686-NEXT: andb $7, %al
; i686-NEXT: shrb $3, %cl
; i686-NEXT: andb $15, %cl
-; i686-NEXT: movzbl %cl, %ebp
-; i686-NEXT: movl 4(%esp,%ebp), %edx
-; i686-NEXT: movl %edx, %esi
+; i686-NEXT: movzbl %cl, %edx
+; i686-NEXT: movl 8(%esp,%edx), %ecx
+; i686-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; i686-NEXT: movl 12(%esp,%edx), %ebp
+; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrl %cl, %esi
+; i686-NEXT: shrl %cl, %ebp
; i686-NEXT: notb %cl
-; i686-NEXT: movl 8(%esp,%ebp), %ebx
-; i686-NEXT: leal (%ebx,%ebx), %edi
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: orl %esi, %edi
-; i686-NEXT: movl (%esp,%ebp), %esi
-; i686-NEXT: movl 12(%esp,%ebp), %ebp
+; i686-NEXT: movl 16(%esp,%edx), %ebx
+; i686-NEXT: leal (%ebx,%ebx), %esi
+; i686-NEXT: shll %cl, %esi
+; i686-NEXT: movl 20(%esp,%edx), %edx
; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrdl %cl, %ebp, %ebx
-; i686-NEXT: shrdl %cl, %edx, %esi
-; i686-NEXT: sarl %cl, %ebp
+; i686-NEXT: shrdl %cl, %edx, %ebx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; i686-NEXT: shrdl %cl, %edi, (%esp) # 4-byte Folded Spill
+; i686-NEXT: orl %ebp, %esi
+; i686-NEXT: sarl %cl, %edx
; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl %ebp, 12(%eax)
+; i686-NEXT: movl %edx, 12(%eax)
; i686-NEXT: movl %ebx, 8(%eax)
-; i686-NEXT: movl %esi, (%eax)
-; i686-NEXT: movl %edi, 4(%eax)
-; i686-NEXT: addl $32, %esp
+; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, (%eax)
+; i686-NEXT: movl %esi, 4(%eax)
+; i686-NEXT: addl $40, %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -157,7 +163,7 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $32, %esp
+; i686-NEXT: subl $36, %esp
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -170,38 +176,37 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl $0, (%esp)
+; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ecx, %eax
; i686-NEXT: andb $7, %al
; i686-NEXT: shrb $3, %cl
; i686-NEXT: andb $15, %cl
; i686-NEXT: negb %cl
-; i686-NEXT: movsbl %cl, %ebp
-; i686-NEXT: movl 24(%esp,%ebp), %ebx
-; i686-NEXT: movl %ebx, %edx
+; i686-NEXT: movsbl %cl, %edx
+; i686-NEXT: movl 28(%esp,%edx), %ebp
+; i686-NEXT: movl %ebp, %esi
; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shll %cl, %edx
+; i686-NEXT: shll %cl, %esi
+; i686-NEXT: movl %esi, (%esp) # 4-byte Spill
; i686-NEXT: notb %cl
-; i686-NEXT: movl 20(%esp,%ebp), %edi
+; i686-NEXT: movl 20(%esp,%edx), %ebx
+; i686-NEXT: movl 24(%esp,%edx), %edi
; i686-NEXT: movl %edi, %esi
; i686-NEXT: shrl %esi
; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: orl %edx, %esi
-; i686-NEXT: movl 16(%esp,%ebp), %edx
-; i686-NEXT: movl 28(%esp,%ebp), %ebp
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shldl %cl, %ebx, %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl %ebp, 12(%ecx)
-; i686-NEXT: movl %edx, %ebx
+; i686-NEXT: movl 32(%esp,%edx), %edx
; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shll %cl, %ebx
-; i686-NEXT: shldl %cl, %edx, %edi
+; i686-NEXT: shldl %cl, %ebp, %edx
+; i686-NEXT: movl %ebx, %ebp
+; i686-NEXT: shll %cl, %ebp
+; i686-NEXT: shldl %cl, %ebx, %edi
+; i686-NEXT: orl (%esp), %esi # 4-byte Folded Reload
; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT: movl %edx, 12(%eax)
; i686-NEXT: movl %edi, 4(%eax)
-; i686-NEXT: movl %ebx, (%eax)
+; i686-NEXT: movl %ebp, (%eax)
; i686-NEXT: movl %esi, 8(%eax)
-; i686-NEXT: addl $32, %esp
+; i686-NEXT: addl $36, %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -267,22 +272,21 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $100, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; i686-NEXT: subl $108, %esp
+; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -290,78 +294,84 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %esi, %ecx
+; i686-NEXT: movl %edi, %ebp
+; i686-NEXT: movl %edi, %ecx
; i686-NEXT: andl $7, %ecx
; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shrl $3, %esi
-; i686-NEXT: andl $15, %esi
-; i686-NEXT: movl 40(%esp,%esi), %eax
-; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shrl %cl, %eax
-; i686-NEXT: notl %ecx
-; i686-NEXT: movl 44(%esp,%esi), %edx
+; i686-NEXT: shrl $3, %ebp
+; i686-NEXT: andl $15, %ebp
+; i686-NEXT: movl 48(%esp,%ebp), %edx
; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: addl %edx, %edx
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shll %cl, %edx
-; i686-NEXT: orl %eax, %edx
+; i686-NEXT: shrl %cl, %edx
; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 36(%esp,%esi), %eax
+; i686-NEXT: notl %ecx
+; i686-NEXT: movl 52(%esp,%ebp), %eax
; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: addl %eax, %eax
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: shll %cl, %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 44(%esp,%ebp), %eax
+; i686-NEXT: movl %eax, (%esp) # 4-byte Spill
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, %edx
-; i686-NEXT: andl $7, %edx
-; i686-NEXT: shrl $3, %ebx
-; i686-NEXT: andl $15, %ebx
-; i686-NEXT: movl 72(%esp,%ebx), %ebp
-; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: shrl %cl, %ebp
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: notl %ecx
-; i686-NEXT: movl 76(%esp,%ebx), %eax
+; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
+; i686-NEXT: movl %edx, %ebx
+; i686-NEXT: andl $7, %ebx
+; i686-NEXT: shrl $3, %edx
+; i686-NEXT: andl $15, %edx
+; i686-NEXT: movl 80(%esp,%edx), %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl %ebx, %ecx
+; i686-NEXT: shrl %cl, %eax
; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: leal (%eax,%eax), %edi
+; i686-NEXT: movl %ebx, %ecx
+; i686-NEXT: notl %ecx
+; i686-NEXT: movl 84(%esp,%edx), %edi
+; i686-NEXT: leal (%edi,%edi), %eax
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: orl %ebp, %edi
-; i686-NEXT: movl 48(%esp,%esi), %esi
+; i686-NEXT: shll %cl, %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 56(%esp,%ebp), %esi
; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; i686-NEXT: movl %eax, %ecx
; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: movl 68(%esp,%ebx), %ecx
-; i686-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; i686-NEXT: movl 80(%esp,%ebx), %esi
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT: shrdl %cl, %esi, %ebx
+; i686-NEXT: movl 88(%esp,%edx), %esi
+; i686-NEXT: movl %ebx, %ecx
+; i686-NEXT: shrdl %cl, %esi, %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl %eax, %ecx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT: shrdl %cl, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT: shrl %cl, %ebp
-; i686-NEXT: movl %edx, %ecx
+; i686-NEXT: shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; i686-NEXT: shrl %cl, %edi
+; i686-NEXT: movl 76(%esp,%edx), %edx
+; i686-NEXT: movl %ebx, %ecx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
+; i686-NEXT: shrdl %cl, %eax, %edx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; i686-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; i686-NEXT: movl %ebx, %ecx
; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl %esi, 28(%ecx)
-; i686-NEXT: movl %ebx, 24(%ecx)
-; i686-NEXT: movl (%esp), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, 16(%ecx)
-; i686-NEXT: movl %ebp, 12(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 8(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, (%ecx)
-; i686-NEXT: movl %edi, 20(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, 4(%ecx)
-; i686-NEXT: addl $100, %esp
+; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT: movl %esi, 28(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 24(%eax)
+; i686-NEXT: movl %edx, 16(%eax)
+; i686-NEXT: movl %edi, 12(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 8(%eax)
+; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, (%eax)
+; i686-NEXT: movl %ebp, 20(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 4(%eax)
+; i686-NEXT: addl $108, %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -405,21 +415,20 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $92, %esp
+; i686-NEXT: subl $104, %esp
; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: sarl $31, %ebx
; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
@@ -434,75 +443,79 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %edi, %ebx
-; i686-NEXT: andl $7, %ebx
-; i686-NEXT: shrl $3, %edi
-; i686-NEXT: andl $15, %edi
-; i686-NEXT: movl 32(%esp,%edi), %eax
-; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: shrl %cl, %eax
-; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: notl %ecx
-; i686-NEXT: movl 36(%esp,%edi), %edx
-; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: addl %edx, %edx
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shll %cl, %edx
-; i686-NEXT: orl %eax, %edx
+; i686-NEXT: movl %ebp, %ecx
+; i686-NEXT: andl $7, %ecx
+; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: shrl $3, %ebp
+; i686-NEXT: andl $15, %ebp
+; i686-NEXT: movl 44(%esp,%ebp), %edx
+; i686-NEXT: movl %edx, (%esp) # 4-byte Spill
+; i686-NEXT: shrl %cl, %edx
; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ebp, %eax
-; i686-NEXT: movl %ebp, %edx
-; i686-NEXT: andl $7, %edx
-; i686-NEXT: shrl $3, %eax
-; i686-NEXT: andl $15, %eax
-; i686-NEXT: movl 64(%esp,%eax), %ebp
-; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %eax, (%esp) # 4-byte Spill
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: shrl %cl, %ebp
-; i686-NEXT: movl %edx, %ecx
; i686-NEXT: notl %ecx
-; i686-NEXT: movl 68(%esp,%eax), %esi
-; i686-NEXT: leal (%esi,%esi), %eax
+; i686-NEXT: movl 48(%esp,%ebp), %edx
+; i686-NEXT: leal (%edx,%edx), %eax
+; i686-NEXT: movl %edx, %edi
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
; i686-NEXT: shll %cl, %eax
-; i686-NEXT: orl %ebp, %eax
-; i686-NEXT: movl 28(%esp,%edi), %ecx
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
+; i686-NEXT: movl %edx, %ecx
+; i686-NEXT: andl $7, %ecx
; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 40(%esp,%edi), %edi
+; i686-NEXT: shrl $3, %edx
+; i686-NEXT: andl $15, %edx
+; i686-NEXT: movl 76(%esp,%edx), %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: shrl %cl, %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: notl %ecx
+; i686-NEXT: movl 80(%esp,%edx), %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: leal (%eax,%eax), %esi
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: shll %cl, %esi
+; i686-NEXT: movl 52(%esp,%ebp), %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; i686-NEXT: movl %ebx, %ecx
+; i686-NEXT: shrdl %cl, %eax, %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 84(%esp,%edx), %edi
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
; i686-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
-; i686-NEXT: movl 60(%esp,%ecx), %ebp
-; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 72(%esp,%ecx), %ebp
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: shrdl %cl, %ebp, %esi
-; i686-NEXT: movl %esi, (%esp) # 4-byte Spill
+; i686-NEXT: movl 40(%esp,%ebp), %eax
; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: sarl %cl, %edi
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT: shrdl %cl, %esi, %ebx
+; i686-NEXT: movl (%esp), %ebp # 4-byte Reload
+; i686-NEXT: shrdl %cl, %ebp, %eax
+; i686-NEXT: movl %eax, (%esp) # 4-byte Spill
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; i686-NEXT: sarl %cl, %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl %ebp, 28(%ecx)
-; i686-NEXT: movl (%esp), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 24(%ecx)
-; i686-NEXT: movl %ebx, 16(%ecx)
-; i686-NEXT: movl %edi, 12(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 8(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, (%ecx)
-; i686-NEXT: movl %eax, 20(%ecx)
+; i686-NEXT: movl 72(%esp,%edx), %edx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; i686-NEXT: movl %ebx, %ecx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, 4(%ecx)
-; i686-NEXT: addl $92, %esp
+; i686-NEXT: shrdl %cl, %eax, %edx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; i686-NEXT: movl %ebx, %ecx
+; i686-NEXT: sarl %cl, %edi
+; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT: movl %edi, 28(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 24(%eax)
+; i686-NEXT: movl %edx, 16(%eax)
+; i686-NEXT: movl %ebp, 12(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 8(%eax)
+; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, (%eax)
+; i686-NEXT: movl %esi, 20(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 4(%eax)
+; i686-NEXT: addl $104, %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -549,7 +562,7 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $100, %esp
+; i686-NEXT: subl $104, %esp
; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -576,82 +589,81 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl 8(%eax), %edx
-; i686-NEXT: movl %edx, (%esp) # 4-byte Spill
+; i686-NEXT: movl 8(%eax), %esi
+; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: andl $7, %ecx
; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shll %cl, %edx
-; i686-NEXT: movl 4(%eax), %esi
+; i686-NEXT: shll %cl, %esi
; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 4(%eax), %esi
+; i686-NEXT: movl %esi, (%esp) # 4-byte Spill
; i686-NEXT: shrl %esi
; i686-NEXT: notl %ecx
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: orl %edx, %esi
; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl (%eax), %esi
; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl (%eax), %eax
-; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl %ebx, %edx
; i686-NEXT: shrl $3, %edx
; i686-NEXT: andl $15, %edx
-; i686-NEXT: leal {{[0-9]+}}(%esp), %esi
-; i686-NEXT: subl %edx, %esi
+; i686-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; i686-NEXT: subl %edx, %ecx
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: andl $7, %ebx
-; i686-NEXT: movl 8(%esi), %edi
-; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 8(%ecx), %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl %ecx, %edi
+; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: movl 4(%esi), %eax
+; i686-NEXT: shll %cl, %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 4(%edi), %eax
; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: shrl %eax
; i686-NEXT: movl %ebx, %ecx
; i686-NEXT: notl %ecx
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
; i686-NEXT: shrl %cl, %eax
-; i686-NEXT: orl %edi, %eax
-; i686-NEXT: movl (%esi), %ecx
-; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; i686-NEXT: movl %esi, %edi
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT: shll %cl, %esi
+; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: shldl %cl, %edi, (%esp) # 4-byte Folded Spill
; i686-NEXT: negl %ebp
-; i686-NEXT: movl 64(%esp,%ebp), %esi
+; i686-NEXT: movl 68(%esp,%ebp), %ebp
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: movl (%esp), %edi # 4-byte Reload
-; i686-NEXT: shldl %cl, %edi, %esi
-; i686-NEXT: movl %esi, (%esp) # 4-byte Spill
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT: movl %esi, %edi
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; i686-NEXT: shldl %cl, %edi, %ebp
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl (%ecx), %edi
+; i686-NEXT: movl %edi, %esi
; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT: shldl %cl, %esi, %ebp
+; i686-NEXT: shll %cl, %esi
+; i686-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; i686-NEXT: negl %edx
-; i686-NEXT: movl 96(%esp,%edx), %edx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT: shldl %cl, %ebx, %edx
+; i686-NEXT: movl 100(%esp,%edx), %edx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; i686-NEXT: shldl %cl, %edi, %edx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; i686-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; i686-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl %edx, 28(%ecx)
-; i686-NEXT: movl %ebp, 20(%ecx)
-; i686-NEXT: movl %edi, 16(%ecx)
-; i686-NEXT: movl (%esp), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 12(%ecx)
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; i686-NEXT: movl %edx, 20(%ecx)
+; i686-NEXT: movl %esi, 16(%ecx)
+; i686-NEXT: movl %ebp, 12(%ecx)
+; i686-NEXT: movl (%esp), %edx # 4-byte Reload
; i686-NEXT: movl %edx, 4(%ecx)
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; i686-NEXT: movl %edx, (%ecx)
; i686-NEXT: movl %eax, 24(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, 8(%ecx)
-; i686-NEXT: addl $100, %esp
+; i686-NEXT: movl %edi, 8(%ecx)
+; i686-NEXT: addl $104, %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll
index e1466aebf42258..e16a3f81bde906 100644
--- a/llvm/test/CodeGen/X86/shift-i256.ll
+++ b/llvm/test/CodeGen/X86/shift-i256.ll
@@ -11,7 +11,7 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: subl $92, %esp
+; CHECK-NEXT: subl $100, %esp
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -42,64 +42,71 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-NEXT: andb $7, %al
; CHECK-NEXT: shrb $3, %cl
; CHECK-NEXT: movzbl %cl, %ebp
-; CHECK-NEXT: movl 32(%esp,%ebp), %esi
-; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 40(%esp,%ebp), %ecx
+; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %ecx, %edx
; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl %cl, %esi
+; CHECK-NEXT: shrl %cl, %edx
+; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: notb %dl
-; CHECK-NEXT: movl 36(%esp,%ebp), %ecx
-; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: leal (%ecx,%ecx), %edi
+; CHECK-NEXT: movl 44(%esp,%ebp), %ecx
+; CHECK-NEXT: leal (%ecx,%ecx), %esi
+; CHECK-NEXT: movl %ecx, %ebx
; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: shll %cl, %edi
-; CHECK-NEXT: orl %esi, %edi
-; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 40(%esp,%ebp), %esi
+; CHECK-NEXT: shll %cl, %esi
; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 48(%esp,%ebp), %esi
+; CHECK-NEXT: movl %esi, (%esp) # 4-byte Spill
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shrl %cl, %esi
-; CHECK-NEXT: movl 44(%esp,%ebp), %ecx
-; CHECK-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; CHECK-NEXT: leal (%ecx,%ecx), %edi
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 52(%esp,%ebp), %ecx
+; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: leal (%ecx,%ecx), %esi
; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: shll %cl, %edi
-; CHECK-NEXT: orl %esi, %edi
-; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 48(%esp,%ebp), %ebx
-; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: shll %cl, %esi
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 56(%esp,%ebp), %esi
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl %cl, %ebx
-; CHECK-NEXT: movl 52(%esp,%ebp), %edi
+; CHECK-NEXT: shrl %cl, %esi
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 60(%esp,%ebp), %edi
; CHECK-NEXT: leal (%edi,%edi), %esi
; CHECK-NEXT: movl %edx, %ecx
; CHECK-NEXT: shll %cl, %esi
-; CHECK-NEXT: orl %ebx, %esi
; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: movl (%esp), %edx # 4-byte Reload
+; CHECK-NEXT: shrdl %cl, %edx, %ebx
+; CHECK-NEXT: movl %ebx, (%esp) # 4-byte Spill
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; CHECK-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; CHECK-NEXT: movl 28(%esp,%ebp), %edx
-; CHECK-NEXT: movl 56(%esp,%ebp), %ebx
+; CHECK-NEXT: movl 64(%esp,%ebp), %ebx
; CHECK-NEXT: shrdl %cl, %ebx, %edi
+; CHECK-NEXT: movl 36(%esp,%ebp), %edx
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; CHECK-NEXT: shrdl %cl, %ebp, %edx
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; CHECK-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: sarl %cl, %ebx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %ebx, 28(%eax)
; CHECK-NEXT: movl %edi, 24(%eax)
-; CHECK-NEXT: movl (%esp), %ecx # 4-byte Reload
-; CHECK-NEXT: movl %ecx, 16(%eax)
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT: movl %ecx, 16(%eax)
+; CHECK-NEXT: movl (%esp), %ecx # 4-byte Reload
; CHECK-NEXT: movl %ecx, 8(%eax)
; CHECK-NEXT: movl %edx, (%eax)
; CHECK-NEXT: movl %esi, 20(%eax)
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-NEXT: movl %ecx, 12(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: movl %ecx, 4(%eax)
-; CHECK-NEXT: addl $92, %esp
+; CHECK-NEXT: movl %ebp, 4(%eax)
+; CHECK-NEXT: addl $100, %esp
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
@@ -202,8 +209,8 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: subl $92, %esp
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: subl $100, %esp
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -220,68 +227,75 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movb %al, %ch
-; CHECK-NEXT: andb $7, %ch
-; CHECK-NEXT: shrb $3, %al
-; CHECK-NEXT: negb %al
-; CHECK-NEXT: movsbl %al, %eax
-; CHECK-NEXT: movl 68(%esp,%eax), %edx
-; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movb %ch, %cl
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: andb $7, %al
+; CHECK-NEXT: shrb $3, %cl
+; CHECK-NEXT: negb %cl
+; CHECK-NEXT: movsbl %cl, %ebp
+; CHECK-NEXT: movl 76(%esp,%ebp), %edx
+; CHECK-NEXT: movl %edx, (%esp) # 4-byte Spill
+; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shll %cl, %edx
-; CHECK-NEXT: notb %cl
-; CHECK-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; CHECK-NEXT: movl 64(%esp,%eax), %ebp
-; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: shrl %ebp
-; CHECK-NEXT: shrl %cl, %ebp
-; CHECK-NEXT: orl %edx, %ebp
-; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 76(%esp,%eax), %edx
; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movb %ch, %cl
-; CHECK-NEXT: shll %cl, %edx
-; CHECK-NEXT: movl 72(%esp,%eax), %ebx
-; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: shrl %ebx
-; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-NEXT: shrl %cl, %ebx
-; CHECK-NEXT: orl %edx, %ebx
-; CHECK-NEXT: movl 84(%esp,%eax), %esi
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: notb %dl
+; CHECK-NEXT: movl 72(%esp,%ebp), %esi
; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movb %ch, %cl
+; CHECK-NEXT: shrl %esi
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: shrl %cl, %esi
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 84(%esp,%ebp), %esi
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shll %cl, %esi
-; CHECK-NEXT: movl 80(%esp,%eax), %edi
-; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: shrl %edx
-; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-NEXT: shrl %cl, %edx
-; CHECK-NEXT: orl %esi, %edx
-; CHECK-NEXT: movb %ch, %cl
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 80(%esp,%ebp), %ecx
+; CHECK-NEXT: movl %ecx, %ebx
+; CHECK-NEXT: movl %ecx, %esi
+; CHECK-NEXT: shrl %ebx
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: shrl %cl, %ebx
+; CHECK-NEXT: movl 92(%esp,%ebp), %edi
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shll %cl, %edi
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 88(%esp,%ebp), %edi
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: shrl %edi
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: shrl %cl, %edi
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: movl (%esp), %edx # 4-byte Reload
+; CHECK-NEXT: shldl %cl, %edx, %esi
+; CHECK-NEXT: movl %esi, (%esp) # 4-byte Spill
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: movl 96(%esp,%ebp), %edx
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-NEXT: shldl %cl, %esi, %edx
+; CHECK-NEXT: movl 68(%esp,%ebp), %esi
+; CHECK-NEXT: movl %esi, %ebp
+; CHECK-NEXT: shll %cl, %ebp
; CHECK-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-NEXT: shldl %cl, %esi, %edi
-; CHECK-NEXT: movl 60(%esp,%eax), %ebp
-; CHECK-NEXT: movl 88(%esp,%eax), %esi
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-NEXT: shldl %cl, %eax, %esi
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; CHECK-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; CHECK-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl %esi, 28(%eax)
-; CHECK-NEXT: movl %edi, 20(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-NEXT: movl %esi, 12(%eax)
-; CHECK-NEXT: movl %ebp, %esi
-; CHECK-NEXT: shll %cl, %esi
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; CHECK-NEXT: shldl %cl, %ebp, %edi
-; CHECK-NEXT: movl %edi, 4(%eax)
-; CHECK-NEXT: movl %esi, (%eax)
-; CHECK-NEXT: movl %edx, 24(%eax)
+; CHECK-NEXT: movl %edx, 28(%eax)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT: movl %edx, 20(%eax)
+; CHECK-NEXT: movl (%esp), %edx # 4-byte Reload
+; CHECK-NEXT: movl %edx, 12(%eax)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT: movl %edx, 4(%eax)
+; CHECK-NEXT: movl %ebp, (%eax)
+; CHECK-NEXT: movl %edi, 24(%eax)
; CHECK-NEXT: movl %ebx, 16(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-NEXT: movl %ecx, 8(%eax)
-; CHECK-NEXT: addl $92, %esp
+; CHECK-NEXT: addl $100, %esp
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 62051d17099403..a23aafe7e9d2b3 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -1974,90 +1974,84 @@ entry:
define void @PR34947(ptr %p0, ptr %p1) nounwind {
; X86-SSE-LABEL: PR34947:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %ebp
; X86-SSE-NEXT: pushl %ebx
; X86-SSE-NEXT: pushl %edi
; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: pushl %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movzwl 16(%eax), %edx
-; X86-SSE-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-SSE-NEXT: movdqa (%eax), %xmm2
-; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm2, %xmm0
-; X86-SSE-NEXT: pextrw $7, %xmm2, %eax
-; X86-SSE-NEXT: pextrw $4, %xmm2, %esi
-; X86-SSE-NEXT: pextrw $1, %xmm2, %edi
-; X86-SSE-NEXT: pextrw $0, %xmm2, %ebx
-; X86-SSE-NEXT: pextrw $3, %xmm2, %ebp
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl 28(%ecx)
-; X86-SSE-NEXT: movd %edx, %xmm1
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-SSE-NEXT: movd %xmm3, %eax
+; X86-SSE-NEXT: movzwl 16(%eax), %ecx
+; X86-SSE-NEXT: movdqa (%eax), %xmm1
+; X86-SSE-NEXT: pxor %xmm2, %xmm2
+; X86-SSE-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-SSE-NEXT: pextrw $7, %xmm1, %eax
; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl 24(%ecx)
+; X86-SSE-NEXT: divl 28(%ebx)
; X86-SSE-NEXT: movd %edx, %xmm3
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; X86-SSE-NEXT: movl %esi, %eax
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; X86-SSE-NEXT: movd %xmm4, %eax
; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl 16(%ecx)
-; X86-SSE-NEXT: movd %edx, %xmm1
+; X86-SSE-NEXT: divl 24(%ebx)
+; X86-SSE-NEXT: movd %edx, %xmm4
+; X86-SSE-NEXT: pextrw $4, %xmm1, %eax
+; X86-SSE-NEXT: xorl %edx, %edx
+; X86-SSE-NEXT: divl 16(%ebx)
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-SSE-NEXT: movd %xmm0, %eax
-; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl 20(%ecx)
; X86-SSE-NEXT: movd %edx, %xmm0
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; X86-SSE-NEXT: movl %edi, %eax
; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl 4(%ecx)
-; X86-SSE-NEXT: movd %edx, %xmm3
-; X86-SSE-NEXT: movl %ebx, %eax
+; X86-SSE-NEXT: divl 20(%ebx)
+; X86-SSE-NEXT: pextrw $1, %xmm1, %eax
+; X86-SSE-NEXT: pextrw $0, %xmm1, %edi
+; X86-SSE-NEXT: pextrw $3, %xmm1, %esi
+; X86-SSE-NEXT: movdqa %xmm1, %xmm5
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; X86-SSE-NEXT: movd %edx, %xmm1
; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl (%ecx)
-; X86-SSE-NEXT: movd %edx, %xmm0
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; X86-SSE-NEXT: movl %ebp, %eax
+; X86-SSE-NEXT: divl 4(%ebx)
+; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X86-SSE-NEXT: movd %edx, %xmm2
+; X86-SSE-NEXT: movl %edi, %eax
; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl 12(%ecx)
-; X86-SSE-NEXT: movd %edx, %xmm3
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; X86-SSE-NEXT: movd %xmm2, %eax
+; X86-SSE-NEXT: divl (%ebx)
+; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; X86-SSE-NEXT: movd %edx, %xmm1
+; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: movl %esi, %eax
; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl 8(%ecx)
+; X86-SSE-NEXT: divl 12(%ebx)
; X86-SSE-NEXT: movd %edx, %xmm2
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; X86-SSE-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3]
+; X86-SSE-NEXT: movd %xmm3, %eax
+; X86-SSE-NEXT: xorl %edx, %edx
+; X86-SSE-NEXT: divl 8(%ebx)
+; X86-SSE-NEXT: movd %edx, %xmm3
+; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; X86-SSE-NEXT: movl %ecx, %eax
; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl 32(%ecx)
+; X86-SSE-NEXT: divl 32(%ebx)
; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm3
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE-NEXT: pmuludq %xmm2, %xmm3
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X86-SSE-NEXT: pmuludq %xmm2, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-SSE-NEXT: pmuludq %xmm2, %xmm3
; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X86-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007
; X86-SSE-NEXT: movl %eax, (%eax)
-; X86-SSE-NEXT: movdqa %xmm1, (%eax)
; X86-SSE-NEXT: movdqa %xmm0, (%eax)
-; X86-SSE-NEXT: addl $4, %esp
+; X86-SSE-NEXT: movdqa %xmm1, (%eax)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: popl %edi
; X86-SSE-NEXT: popl %ebx
-; X86-SSE-NEXT: popl %ebp
; X86-SSE-NEXT: retl
;
; X86-AVX1-LABEL: PR34947:
@@ -2188,76 +2182,74 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind {
; X64-SSE-LABEL: PR34947:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movzwl 16(%rdi), %ecx
-; X64-SSE-NEXT: movdqa (%rdi), %xmm2
-; X64-SSE-NEXT: pxor %xmm1, %xmm1
-; X64-SSE-NEXT: movdqa %xmm2, %xmm0
-; X64-SSE-NEXT: pextrw $7, %xmm2, %eax
-; X64-SSE-NEXT: pextrw $4, %xmm2, %edi
-; X64-SSE-NEXT: pextrw $1, %xmm2, %r8d
-; X64-SSE-NEXT: pextrw $0, %xmm2, %r9d
-; X64-SSE-NEXT: pextrw $3, %xmm2, %r10d
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE-NEXT: movdqa (%rdi), %xmm1
+; X64-SSE-NEXT: pxor %xmm2, %xmm2
+; X64-SSE-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-SSE-NEXT: pextrw $7, %xmm1, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl 28(%rsi)
-; X64-SSE-NEXT: movd %edx, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X64-SSE-NEXT: movd %xmm3, %eax
+; X64-SSE-NEXT: movd %edx, %xmm3
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; X64-SSE-NEXT: movd %xmm4, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl 24(%rsi)
-; X64-SSE-NEXT: movd %edx, %xmm3
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: movd %edx, %xmm4
+; X64-SSE-NEXT: pextrw $4, %xmm1, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl 16(%rsi)
-; X64-SSE-NEXT: movd %edx, %xmm1
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X64-SSE-NEXT: movd %xmm0, %eax
+; X64-SSE-NEXT: movd %edx, %xmm0
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl 20(%rsi)
-; X64-SSE-NEXT: movd %edx, %xmm0
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; X64-SSE-NEXT: movl %r8d, %eax
+; X64-SSE-NEXT: pextrw $1, %xmm1, %eax
+; X64-SSE-NEXT: pextrw $0, %xmm1, %edi
+; X64-SSE-NEXT: pextrw $3, %xmm1, %r8d
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT: movd %edx, %xmm2
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl 4(%rsi)
-; X64-SSE-NEXT: movd %edx, %xmm0
-; X64-SSE-NEXT: movl %r9d, %eax
+; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X64-SSE-NEXT: movd %edx, %xmm3
+; X64-SSE-NEXT: movl %edi, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl (%rsi)
-; X64-SSE-NEXT: movd %edx, %xmm3
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; X64-SSE-NEXT: movl %r10d, %eax
+; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; X64-SSE-NEXT: movd %edx, %xmm2
+; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X64-SSE-NEXT: movl %r8d, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl 12(%rsi)
-; X64-SSE-NEXT: movd %edx, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; X64-SSE-NEXT: movd %xmm2, %eax
+; X64-SSE-NEXT: movd %edx, %xmm3
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; X64-SSE-NEXT: movd %xmm1, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl 8(%rsi)
-; X64-SSE-NEXT: movd %edx, %xmm2
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X64-SSE-NEXT: movd %edx, %xmm1
+; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; X64-SSE-NEXT: movl %ecx, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl 32(%rsi)
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199]
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm0, %xmm3
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; X64-SSE-NEXT: pmuludq %xmm0, %xmm2
+; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-SSE-NEXT: pmuludq %xmm0, %xmm2
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-SSE-NEXT: pmuludq %xmm1, %xmm3
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE-NEXT: pmuludq %xmm1, %xmm3
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007
; X64-SSE-NEXT: movl %eax, (%rax)
-; X64-SSE-NEXT: movdqa %xmm1, (%rax)
-; X64-SSE-NEXT: movdqa %xmm3, (%rax)
+; X64-SSE-NEXT: movdqa %xmm0, (%rax)
+; X64-SSE-NEXT: movdqa %xmm2, (%rax)
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: PR34947:
diff --git a/llvm/test/CodeGen/X86/smax.ll b/llvm/test/CodeGen/X86/smax.ll
index 2d59422953eb3d..601a1f758afabf 100644
--- a/llvm/test/CodeGen/X86/smax.ll
+++ b/llvm/test/CodeGen/X86/smax.ll
@@ -511,33 +511,36 @@ define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $40, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb %bl, %al
-; X86-NEXT: cmovgl %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb %dl, %al
-; X86-NEXT: cmovgl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: cmpb %al, %dl
+; X86-NEXT: cmovgl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpb %cl, %al
; X86-NEXT: cmovgl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpb %bl, %al
+; X86-NEXT: cmovgl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: cmpb %cl, %al
; X86-NEXT: cmovgl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %edi, %ecx
; X86-NEXT: cmpb %cl, %al
; X86-NEXT: cmovgl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %esi, %ecx
; X86-NEXT: cmpb %cl, %al
; X86-NEXT: cmovgl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
diff --git a/llvm/test/CodeGen/X86/smin.ll b/llvm/test/CodeGen/X86/smin.ll
index bde61d5738ed5c..31246b3ae7fb4d 100644
--- a/llvm/test/CodeGen/X86/smin.ll
+++ b/llvm/test/CodeGen/X86/smin.ll
@@ -512,33 +512,36 @@ define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $40, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb %bl, %al
-; X86-NEXT: cmovll %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb %dl, %al
-; X86-NEXT: cmovll %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: cmpb %al, %dl
+; X86-NEXT: cmovll %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpb %cl, %al
; X86-NEXT: cmovll %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpb %bl, %al
+; X86-NEXT: cmovll %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: cmpb %cl, %al
; X86-NEXT: cmovll %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %edi, %ecx
; X86-NEXT: cmpb %cl, %al
; X86-NEXT: cmovll %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %esi, %ecx
; X86-NEXT: cmpb %cl, %al
; X86-NEXT: cmovll %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll
index da0e3fdc1a5272..5d657eac120f1a 100644
--- a/llvm/test/CodeGen/X86/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll
@@ -271,61 +271,63 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: mull %ebp
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %esi, %ebp
; X86-NEXT: adcl %ecx, %edi
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
+; X86-NEXT: mull %ebx
; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: addl %esi, %edi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebp
+; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: setb %bl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: addl %esi, %ecx
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: adcl %esi, %edi
+; X86-NEXT: adcl %ebp, %edi
; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index b2b5bcc5b44b2c..818eba1c3057a9 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -119,13 +119,14 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %edi
; X86-NEXT: addl %ebx, %edi
@@ -611,14 +612,15 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: addl %ebx, %esi
@@ -708,18 +710,19 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %esi, %ebx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %esi
@@ -735,13 +738,14 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: addl %ebx, %esi
diff --git a/llvm/test/CodeGen/X86/sse-regcall.ll b/llvm/test/CodeGen/X86/sse-regcall.ll
index 6f0293392eef2b..e014c9f8953836 100644
--- a/llvm/test/CodeGen/X86/sse-regcall.ll
+++ b/llvm/test/CodeGen/X86/sse-regcall.ll
@@ -244,8 +244,6 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
; WIN64: # %bb.0:
; WIN64-NEXT: pushq %rbp
; WIN64-NEXT: pushq %rbx
-; WIN64-NEXT: # kill: def $edx killed $edx def $rdx
-; WIN64-NEXT: # kill: def $esi killed $esi def $rsi
; WIN64-NEXT: # kill: def $r15d killed $r15d def $r15
; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14
; WIN64-NEXT: # kill: def $r12d killed $r12d def $r12
@@ -253,7 +251,9 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
; WIN64-NEXT: # kill: def $r10d killed $r10d def $r10
; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9
; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8
+; WIN64-NEXT: # kill: def $esi killed $esi def $rsi
; WIN64-NEXT: # kill: def $edi killed $edi def $rdi
+; WIN64-NEXT: # kill: def $edx killed $edx def $rdx
; WIN64-NEXT: leal (%rdx,%rdi), %ebx
; WIN64-NEXT: movl %edx, %ebp
; WIN64-NEXT: subl %edi, %ebp
@@ -289,14 +289,14 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
;
; LINUXOSX-LABEL: testi32_inp:
; LINUXOSX: # %bb.0:
-; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx
-; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi
; LINUXOSX-NEXT: # kill: def $r14d killed $r14d def $r14
; LINUXOSX-NEXT: # kill: def $r13d killed $r13d def $r13
; LINUXOSX-NEXT: # kill: def $r12d killed $r12d def $r12
; LINUXOSX-NEXT: # kill: def $r9d killed $r9d def $r9
; LINUXOSX-NEXT: # kill: def $r8d killed $r8d def $r8
+; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi
; LINUXOSX-NEXT: # kill: def $edi killed $edi def $rdi
+; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx
; LINUXOSX-NEXT: leal (%rdx,%rdi), %r10d
; LINUXOSX-NEXT: movl %edx, %r11d
; LINUXOSX-NEXT: subl %edi, %r11d
diff --git a/llvm/test/CodeGen/X86/sse-regcall4.ll b/llvm/test/CodeGen/X86/sse-regcall4.ll
index c8df7a233d7e3f..f66f9d9d449420 100644
--- a/llvm/test/CodeGen/X86/sse-regcall4.ll
+++ b/llvm/test/CodeGen/X86/sse-regcall4.ll
@@ -244,14 +244,14 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
; WIN64: # %bb.0:
; WIN64-NEXT: pushq %rbp
; WIN64-NEXT: pushq %rbx
-; WIN64-NEXT: # kill: def $edx killed $edx def $rdx
-; WIN64-NEXT: # kill: def $esi killed $esi def $rsi
; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14
; WIN64-NEXT: # kill: def $r12d killed $r12d def $r12
; WIN64-NEXT: # kill: def $r11d killed $r11d def $r11
; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9
; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8
+; WIN64-NEXT: # kill: def $esi killed $esi def $rsi
; WIN64-NEXT: # kill: def $edi killed $edi def $rdi
+; WIN64-NEXT: # kill: def $edx killed $edx def $rdx
; WIN64-NEXT: leal (%rdx,%rdi), %ebx
; WIN64-NEXT: movl %edx, %ebp
; WIN64-NEXT: subl %edi, %ebp
@@ -288,14 +288,14 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
;
; LINUXOSX-LABEL: testi32_inp:
; LINUXOSX: # %bb.0:
-; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx
-; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi
; LINUXOSX-NEXT: # kill: def $r14d killed $r14d def $r14
; LINUXOSX-NEXT: # kill: def $r13d killed $r13d def $r13
; LINUXOSX-NEXT: # kill: def $r12d killed $r12d def $r12
; LINUXOSX-NEXT: # kill: def $r9d killed $r9d def $r9
; LINUXOSX-NEXT: # kill: def $r8d killed $r8d def $r8
+; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi
; LINUXOSX-NEXT: # kill: def $edi killed $edi def $rdi
+; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx
; LINUXOSX-NEXT: leal (%rdx,%rdi), %r10d
; LINUXOSX-NEXT: movl %edx, %r11d
; LINUXOSX-NEXT: subl %edi, %r11d
diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
index f91758b861b4c4..c8e3930c87c647 100644
--- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
@@ -432,8 +432,8 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmovel %edx, %ecx
; X86-NEXT: movl %ecx, %ebp
-; X86-NEXT: movl %eax, %edx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %eax, %edx
; X86-NEXT: shll %cl, %edx
; X86-NEXT: movswl %dx, %esi
; X86-NEXT: sarl %cl, %esi
@@ -444,8 +444,8 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; X86-NEXT: cmpw %si, %ax
; X86-NEXT: cmovel %edx, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %esi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: shll %cl, %esi
; X86-NEXT: movswl %si, %edi
; X86-NEXT: sarl %cl, %edi
@@ -634,43 +634,57 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: subl $44, %esp
; X86-NEXT: movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movb {{[0-9]+}}(%esp), %bh
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT: movb %ch, %bh
-; X86-NEXT: shlb %cl, %bh
-; X86-NEXT: movzbl %bh, %esi
-; X86-NEXT: sarb %cl, %bh
+; X86-NEXT: movb %ch, %dl
+; X86-NEXT: shlb %cl, %dl
+; X86-NEXT: movzbl %dl, %esi
+; X86-NEXT: sarb %cl, %dl
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: testb %ch, %ch
; X86-NEXT: sets %al
; X86-NEXT: addl $127, %eax
-; X86-NEXT: cmpb %bh, %ch
+; X86-NEXT: cmpb %dl, %ch
; X86-NEXT: cmovel %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movb %bh, %al
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: shlb %cl, %al
+; X86-NEXT: movzbl %al, %esi
+; X86-NEXT: sarb %cl, %al
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: testb %bh, %bh
+; X86-NEXT: sets %cl
+; X86-NEXT: addl $127, %ecx
+; X86-NEXT: cmpb %al, %bh
+; X86-NEXT: cmovel %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movb %dh, %cl
; X86-NEXT: shlb %cl, %al
; X86-NEXT: movzbl %al, %esi
; X86-NEXT: sarb %cl, %al
; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: testb %bl, %bl
+; X86-NEXT: testb %dl, %dl
; X86-NEXT: sets %cl
; X86-NEXT: addl $127, %ecx
-; X86-NEXT: cmpb %al, %bl
+; X86-NEXT: cmpb %al, %dl
; X86-NEXT: cmovel %esi, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movb %dh, %al
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: shlb %cl, %al
; X86-NEXT: movzbl %al, %esi
; X86-NEXT: sarb %cl, %al
; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: testb %dh, %dh
+; X86-NEXT: testb %dl, %dl
; X86-NEXT: sets %cl
; X86-NEXT: addl $127, %ecx
-; X86-NEXT: cmpb %al, %dh
+; X86-NEXT: cmpb %al, %dl
; X86-NEXT: cmovel %esi, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
@@ -763,19 +777,6 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; X86-NEXT: addl $127, %ecx
; X86-NEXT: cmpb %dl, %al
; X86-NEXT: cmovel %esi, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: shlb %cl, %dl
-; X86-NEXT: movzbl %dl, %esi
-; X86-NEXT: sarb %cl, %dl
-; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: testb %al, %al
-; X86-NEXT: sets %cl
-; X86-NEXT: addl $127, %ecx
-; X86-NEXT: cmpb %dl, %al
-; X86-NEXT: cmovel %esi, %ecx
; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
index ae66c5420638bc..f77536fd1df171 100644
--- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
+++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
@@ -7371,6 +7371,14 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR-NEXT: pushq %r13
; SCALAR-NEXT: pushq %r12
; SCALAR-NEXT: pushq %rbx
+; SCALAR-NEXT: movzbl 20(%rdi), %eax
+; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl 19(%rdi), %eax
+; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl 18(%rdi), %eax
+; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl 17(%rdi), %eax
+; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 16(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 15(%rdi), %eax
@@ -7379,9 +7387,9 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 13(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 12(%rdi), %r13d
-; SCALAR-NEXT: movzbl 11(%rdi), %eax
+; SCALAR-NEXT: movzbl 12(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl 11(%rdi), %r13d
; SCALAR-NEXT: movzbl 10(%rdi), %r12d
; SCALAR-NEXT: movzbl 9(%rdi), %r15d
; SCALAR-NEXT: movzbl 8(%rdi), %r14d
@@ -7415,55 +7423,51 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r12b
; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
-; SCALAR-NEXT: notb %r11b
-; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r13b
; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
+; SCALAR-NEXT: notb %r12b
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; SCALAR-NEXT: notb %r10b
+; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
+; SCALAR-NEXT: notb %r9b
+; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
; SCALAR-NEXT: notb %r8b
-; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
-; SCALAR-NEXT: movzbl 17(%rdi), %eax
-; SCALAR-NEXT: notb %al
-; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 18(%rdi), %eax
-; SCALAR-NEXT: notb %al
-; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 19(%rdi), %eax
+; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl 21(%rdi), %eax
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 20(%rdi), %eax
+; SCALAR-NEXT: movzbl 22(%rdi), %ebx
+; SCALAR-NEXT: notb %bl
+; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl 23(%rdi), %eax
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 21(%rdi), %ebp
+; SCALAR-NEXT: movzbl 24(%rdi), %ebp
; SCALAR-NEXT: notb %bpl
; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 22(%rdi), %ebx
-; SCALAR-NEXT: notb %bl
-; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 23(%rdi), %r10d
-; SCALAR-NEXT: notb %r10b
-; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 24(%rdi), %r9d
-; SCALAR-NEXT: notb %r9b
-; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 25(%rdi), %ecx
-; SCALAR-NEXT: notb %cl
-; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl 25(%rdi), %r11d
+; SCALAR-NEXT: notb %r11b
+; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 26(%rdi), %r14d
; SCALAR-NEXT: notb %r14b
; SCALAR-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 27(%rdi), %r15d
; SCALAR-NEXT: notb %r15b
; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 28(%rdi), %r12d
-; SCALAR-NEXT: notb %r12b
-; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 29(%rdi), %r13d
+; SCALAR-NEXT: movzbl 28(%rdi), %r13d
; SCALAR-NEXT: notb %r13b
; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl 29(%rdi), %ecx
+; SCALAR-NEXT: notb %cl
+; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 30(%rdi), %eax
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
@@ -7472,57 +7476,56 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movb %dil, 31(%rsi)
; SCALAR-NEXT: movb %al, 30(%rsi)
-; SCALAR-NEXT: movb %r13b, 29(%rsi)
-; SCALAR-NEXT: movb %r12b, 28(%rsi)
+; SCALAR-NEXT: movb %cl, 29(%rsi)
+; SCALAR-NEXT: movb %r13b, 28(%rsi)
; SCALAR-NEXT: movb %r15b, 27(%rsi)
; SCALAR-NEXT: movb %r14b, 26(%rsi)
-; SCALAR-NEXT: movb %cl, 25(%rsi)
-; SCALAR-NEXT: movb %r9b, 24(%rsi)
-; SCALAR-NEXT: movb %r10b, 23(%rsi)
-; SCALAR-NEXT: movb %bl, 22(%rsi)
-; SCALAR-NEXT: movb %bpl, 21(%rsi)
+; SCALAR-NEXT: movb %r11b, 25(%rsi)
+; SCALAR-NEXT: movb %bpl, 24(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
-; SCALAR-NEXT: movb %bpl, 20(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 19(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 18(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 17(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SCALAR-NEXT: movb %cl, 16(%rsi)
-; SCALAR-NEXT: movb %r8b, 15(%rsi)
-; SCALAR-NEXT: movl %r8d, %r14d
-; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movb %bpl, 23(%rsi)
+; SCALAR-NEXT: movb %bl, 22(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
-; SCALAR-NEXT: movb %bl, 14(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 13(%rsi)
+; SCALAR-NEXT: movb %bl, 21(%rsi)
+; SCALAR-NEXT: movb %r8b, 20(%rsi)
+; SCALAR-NEXT: movb %r9b, 19(%rsi)
+; SCALAR-NEXT: movb %r10b, 18(%rsi)
+; SCALAR-NEXT: movb %r12b, 17(%rsi)
+; SCALAR-NEXT: movl %r12d, %r15d
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
+; SCALAR-NEXT: movb %r12b, 16(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload
+; SCALAR-NEXT: movb %r14b, 15(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 12(%rsi)
-; SCALAR-NEXT: movb %r11b, 11(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %dil, 10(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %dil, 9(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %dil, 8(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
-; SCALAR-NEXT: movb %r11b, 7(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
-; SCALAR-NEXT: movb %r13b, 6(%rsi)
+; SCALAR-NEXT: movb %al, 14(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 13(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 12(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 11(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 10(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
-; SCALAR-NEXT: movb %r10b, 5(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SCALAR-NEXT: movb %r12b, 4(%rsi)
+; SCALAR-NEXT: movb %r10b, 9(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 8(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
-; SCALAR-NEXT: movb %r9b, 3(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
-; SCALAR-NEXT: movb %r15b, 2(%rsi)
+; SCALAR-NEXT: movb %r9b, 7(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
+; SCALAR-NEXT: movb %r13b, 6(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
-; SCALAR-NEXT: movb %r8b, 1(%rsi)
+; SCALAR-NEXT: movb %r8b, 5(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 4(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; SCALAR-NEXT: movb %dil, 3(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; SCALAR-NEXT: movb %dil, 2(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %dil, (%rsi)
+; SCALAR-NEXT: movb %dil, 1(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
+; SCALAR-NEXT: movb %r11b, (%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 31(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
@@ -7539,92 +7542,92 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR-NEXT: movb %sil, 25(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 24(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %sil, 23(%rdx)
+; SCALAR-NEXT: movb %bpl, 23(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 22(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %sil, 21(%rdx)
-; SCALAR-NEXT: movb %bpl, 20(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %sil, 19(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %sil, 18(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %sil, 17(%rdx)
-; SCALAR-NEXT: movb %cl, 16(%rdx)
+; SCALAR-NEXT: movb %bl, 21(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %bl, 20(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %bl, 19(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %bl, 18(%rdx)
+; SCALAR-NEXT: movb %r15b, 17(%rdx)
+; SCALAR-NEXT: movb %r12b, 16(%rdx)
; SCALAR-NEXT: movb %r14b, 15(%rdx)
-; SCALAR-NEXT: movb %bl, 14(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SCALAR-NEXT: movb %cl, 13(%rdx)
+; SCALAR-NEXT: movb %al, 14(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
+; SCALAR-NEXT: movb %bpl, 13(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 12(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %sil, 11(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
+; SCALAR-NEXT: movb %r12b, 11(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
; SCALAR-NEXT: movb %bl, 10(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload
-; SCALAR-NEXT: movb %r14b, 9(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
-; SCALAR-NEXT: movb %bpl, 8(%rdx)
-; SCALAR-NEXT: movb %r11b, 7(%rdx)
+; SCALAR-NEXT: movb %r10b, 9(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SCALAR-NEXT: movb %al, 8(%rdx)
+; SCALAR-NEXT: movb %r9b, 7(%rdx)
; SCALAR-NEXT: movb %r13b, 6(%rdx)
-; SCALAR-NEXT: movb %r10b, 5(%rdx)
-; SCALAR-NEXT: movb %r12b, 4(%rdx)
+; SCALAR-NEXT: movb %r8b, 5(%rdx)
+; SCALAR-NEXT: movb %cl, 4(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r9b, 3(%rdx)
-; SCALAR-NEXT: movb %r15b, 2(%rdx)
-; SCALAR-NEXT: movb %r8b, 1(%rdx)
-; SCALAR-NEXT: movb %dil, (%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 63(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 62(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 61(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 60(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 59(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 58(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 57(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 56(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 55(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 54(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 53(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 52(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 51(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 50(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 49(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 48(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
+; SCALAR-NEXT: movb %r8b, 2(%rdx)
+; SCALAR-NEXT: movb %dil, 1(%rdx)
+; SCALAR-NEXT: movb %r11b, (%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 63(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 62(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 61(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 60(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 59(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 58(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 57(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 56(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 55(%rdx)
+; SCALAR-NEXT: movb %sil, 54(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 53(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 52(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 51(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 50(%rdx)
+; SCALAR-NEXT: movb %r15b, 49(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 48(%rdx)
+; SCALAR-NEXT: movb %r14b, 47(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 46(%rdx)
+; SCALAR-NEXT: movb %bpl, 45(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 44(%rdx)
+; SCALAR-NEXT: movb %r12b, 43(%rdx)
+; SCALAR-NEXT: movb %bl, 42(%rdx)
+; SCALAR-NEXT: movb %r10b, 41(%rdx)
+; SCALAR-NEXT: movb %al, 40(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 47(%rdx)
+; SCALAR-NEXT: movb %al, 39(%rdx)
+; SCALAR-NEXT: movb %r13b, 38(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 46(%rdx)
-; SCALAR-NEXT: movb %cl, 45(%rdx)
+; SCALAR-NEXT: movb %al, 37(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 44(%rdx)
-; SCALAR-NEXT: movb %sil, 43(%rdx)
-; SCALAR-NEXT: movb %bl, 42(%rdx)
-; SCALAR-NEXT: movb %r14b, 41(%rdx)
-; SCALAR-NEXT: movb %bpl, 40(%rdx)
-; SCALAR-NEXT: movb %r11b, 39(%rdx)
-; SCALAR-NEXT: movb %r13b, 38(%rdx)
-; SCALAR-NEXT: movb %r10b, 37(%rdx)
-; SCALAR-NEXT: movb %r12b, 36(%rdx)
+; SCALAR-NEXT: movb %al, 36(%rdx)
; SCALAR-NEXT: movb %r9b, 35(%rdx)
-; SCALAR-NEXT: movb %r15b, 34(%rdx)
-; SCALAR-NEXT: movb %r8b, 33(%rdx)
-; SCALAR-NEXT: movb %dil, 32(%rdx)
+; SCALAR-NEXT: movb %r8b, 34(%rdx)
+; SCALAR-NEXT: movb %dil, 33(%rdx)
+; SCALAR-NEXT: movb %r11b, 32(%rdx)
; SCALAR-NEXT: popq %rbx
; SCALAR-NEXT: popq %r12
; SCALAR-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll
index f0479aea1b82c8..df9a82a591b54c 100644
--- a/llvm/test/CodeGen/X86/umax.ll
+++ b/llvm/test/CodeGen/X86/umax.ll
@@ -987,33 +987,36 @@ define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $40, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb %bl, %al
-; X86-NEXT: cmoval %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb %dl, %al
-; X86-NEXT: cmoval %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: cmpb %al, %dl
+; X86-NEXT: cmoval %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpb %cl, %al
; X86-NEXT: cmoval %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpb %bl, %al
+; X86-NEXT: cmoval %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: cmpb %cl, %al
; X86-NEXT: cmoval %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %edi, %ecx
; X86-NEXT: cmpb %cl, %al
; X86-NEXT: cmoval %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %esi, %ecx
; X86-NEXT: cmpb %cl, %al
; X86-NEXT: cmoval %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1122,31 +1125,34 @@ define <16 x i8> @test_v16i8_1(<16 x i8> %a) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $40, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: cmpb $1, %bl
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpb $1, %dl
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpb $1, %cl
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpb $1, %al
; X86-NEXT: adcl $0, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpb $1, %bl
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: cmpb $1, %al
; X86-NEXT: adcl $0, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpb $1, %dl
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: cmpb $1, %al
; X86-NEXT: adcl $0, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: cmpb $1, %al
; X86-NEXT: adcl $0, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
diff --git a/llvm/test/CodeGen/X86/umin.ll b/llvm/test/CodeGen/X86/umin.ll
index e4ce08966a8946..f3ccafb1330797 100644
--- a/llvm/test/CodeGen/X86/umin.ll
+++ b/llvm/test/CodeGen/X86/umin.ll
@@ -521,33 +521,36 @@ define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $40, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb %bl, %al
-; X86-NEXT: cmovbl %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb %dl, %al
-; X86-NEXT: cmovbl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: cmpb %al, %dl
+; X86-NEXT: cmovbl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpb %cl, %al
; X86-NEXT: cmovbl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpb %bl, %al
+; X86-NEXT: cmovbl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: cmpb %cl, %al
; X86-NEXT: cmovbl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %edi, %ecx
; X86-NEXT: cmpb %cl, %al
; X86-NEXT: cmovbl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %esi, %ecx
; X86-NEXT: cmpb %cl, %al
; X86-NEXT: cmovbl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll
index ccabb360a990c9..8eaa224ede7522 100644
--- a/llvm/test/CodeGen/X86/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll
@@ -93,7 +93,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ecx
@@ -114,15 +114,16 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: addl %edi, %ecx
@@ -142,10 +143,10 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
; X86-NEXT: addl %edi, %esi
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %ebp
-; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: adcl %ebx, %ebp
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
@@ -163,21 +164,21 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %edi, %ecx
-; X86-NEXT: setb (%esp) # 1-byte Folded Spill
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edi
; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: adcl $0, %edi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: adcl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT: setb (%esp) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
@@ -206,18 +207,18 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl %edi, %ebp
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
@@ -240,12 +241,13 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ecx
@@ -254,77 +256,78 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %edi
+; X86-NEXT: adcl %esi, %ebp
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edi, %ebp
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ebp, %edi
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %ecx, %ebp
; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %esi, %ecx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ecx, %esi
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %esi
-; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: adcl %ebx, %ebp
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl %eax, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl %ebx, %edi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: addl (%esp), %edi # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: imull %edx, %ecx
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: imull %edx, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edx
-; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: addl %edx, %ebx
; X86-NEXT: movl %eax, %edx
-; X86-NEXT: addl %edi, %edx
-; X86-NEXT: adcl %esi, %ebx
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: adcl %ebp, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ebx
@@ -447,19 +450,19 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
; X86-NEXT: adcl $0, %esi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %ecx, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: imull %ecx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %edx, %ebp
-; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT: addl %ebp, %edi
+; X86-NEXT: addl %edx, %edi
+; X86-NEXT: imull {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: addl %edi, %ebp
; X86-NEXT: addl %ebx, %ecx
-; X86-NEXT: adcl %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %esi, %ebp
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl %edi, %eax
; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
index b1194bedc4e1ca..c8aa7cf8c8f29e 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
@@ -1198,6 +1198,14 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: movq %rdx, %r8
; CHECK-BASELINE-NEXT: movq %rsi, %r9
; CHECK-BASELINE-NEXT: movq %rdi, %r11
+; CHECK-BASELINE-NEXT: movzbl 19(%rdx), %eax
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 18(%rdx), %eax
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 17(%rdx), %eax
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 16(%rdx), %eax
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %eax
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 14(%rdx), %eax
@@ -1306,26 +1314,26 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: andb 15(%r10), %al
; CHECK-BASELINE-NEXT: xorb %cl, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 16(%r8), %eax
-; CHECK-BASELINE-NEXT: movzbl 16(%r9), %ecx
-; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 16(%r10), %cl
-; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 17(%r8), %eax
+; CHECK-BASELINE-NEXT: movzbl 16(%r9), %eax
+; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-BASELINE-NEXT: xorb %cl, %al
+; CHECK-BASELINE-NEXT: andb 16(%r10), %al
+; CHECK-BASELINE-NEXT: xorb %cl, %al
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 17(%r9), %ecx
+; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: andb 17(%r10), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 18(%r8), %eax
; CHECK-BASELINE-NEXT: movzbl 18(%r9), %ecx
+; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: andb 18(%r10), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 19(%r8), %eax
; CHECK-BASELINE-NEXT: movzbl 19(%r9), %ecx
+; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: andb 19(%r10), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
@@ -1465,6 +1473,14 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: movq %rdx, %r8
; CHECK-SSE1-NEXT: movq %rsi, %r9
; CHECK-SSE1-NEXT: movq %rdi, %r11
+; CHECK-SSE1-NEXT: movzbl 19(%rdx), %eax
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 18(%rdx), %eax
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 17(%rdx), %eax
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 16(%rdx), %eax
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 15(%rdx), %eax
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 14(%rdx), %eax
@@ -1573,26 +1589,26 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: andb 15(%r10), %al
; CHECK-SSE1-NEXT: xorb %cl, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 16(%r8), %eax
-; CHECK-SSE1-NEXT: movzbl 16(%r9), %ecx
-; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 16(%r10), %cl
-; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 17(%r8), %eax
+; CHECK-SSE1-NEXT: movzbl 16(%r9), %eax
+; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-SSE1-NEXT: xorb %cl, %al
+; CHECK-SSE1-NEXT: andb 16(%r10), %al
+; CHECK-SSE1-NEXT: xorb %cl, %al
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 17(%r9), %ecx
+; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: andb 17(%r10), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 18(%r8), %eax
; CHECK-SSE1-NEXT: movzbl 18(%r9), %ecx
+; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: andb 18(%r10), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 19(%r8), %eax
; CHECK-SSE1-NEXT: movzbl 19(%r9), %ecx
+; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: andb 19(%r10), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
@@ -3231,10 +3247,18 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: pushq %r13
; CHECK-BASELINE-NEXT: pushq %r12
; CHECK-BASELINE-NEXT: pushq %rbx
-; CHECK-BASELINE-NEXT: movq %rcx, %r12
-; CHECK-BASELINE-NEXT: movq %rdx, %r15
+; CHECK-BASELINE-NEXT: movq %rcx, %r15
+; CHECK-BASELINE-NEXT: movq %rdx, %rbx
; CHECK-BASELINE-NEXT: movq %rsi, %r14
-; CHECK-BASELINE-NEXT: movq %rdi, %r13
+; CHECK-BASELINE-NEXT: movq %rdi, %r12
+; CHECK-BASELINE-NEXT: movzbl 19(%rdx), %eax
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 18(%rdx), %eax
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 17(%rdx), %eax
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 16(%rdx), %eax
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %eax
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 14(%rdx), %eax
@@ -3247,241 +3271,241 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 10(%rdx), %eax
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %r8d
-; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r9d
-; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r10d
-; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %ebp
-; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %edi
+; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %r13d
+; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r10d
+; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r9d
+; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r8d
+; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %ebp
; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %esi
-; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %eax
-; CHECK-BASELINE-NEXT: movzbl 2(%rdx), %ecx
-; CHECK-BASELINE-NEXT: movzbl (%rdx), %r11d
-; CHECK-BASELINE-NEXT: movzbl 1(%rdx), %edx
-; CHECK-BASELINE-NEXT: movzbl (%r14), %ebx
-; CHECK-BASELINE-NEXT: xorb %r11b, %bl
-; CHECK-BASELINE-NEXT: andb (%r12), %bl
-; CHECK-BASELINE-NEXT: xorb %r11b, %bl
-; CHECK-BASELINE-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 1(%r14), %r11d
-; CHECK-BASELINE-NEXT: xorb %dl, %r11b
-; CHECK-BASELINE-NEXT: andb 1(%r12), %r11b
-; CHECK-BASELINE-NEXT: xorb %dl, %r11b
+; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %edx
+; CHECK-BASELINE-NEXT: movzbl 2(%rbx), %eax
+; CHECK-BASELINE-NEXT: movzbl (%rbx), %edi
+; CHECK-BASELINE-NEXT: movzbl 1(%rbx), %ecx
+; CHECK-BASELINE-NEXT: movzbl (%r14), %r11d
+; CHECK-BASELINE-NEXT: xorb %dil, %r11b
+; CHECK-BASELINE-NEXT: andb (%r15), %r11b
+; CHECK-BASELINE-NEXT: xorb %dil, %r11b
; CHECK-BASELINE-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 2(%r14), %edx
-; CHECK-BASELINE-NEXT: xorb %cl, %dl
-; CHECK-BASELINE-NEXT: andb 2(%r12), %dl
-; CHECK-BASELINE-NEXT: xorb %cl, %dl
-; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 3(%r14), %ecx
+; CHECK-BASELINE-NEXT: movzbl 1(%r14), %edi
+; CHECK-BASELINE-NEXT: xorb %cl, %dil
+; CHECK-BASELINE-NEXT: andb 1(%r15), %dil
+; CHECK-BASELINE-NEXT: xorb %cl, %dil
+; CHECK-BASELINE-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 2(%r14), %ecx
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 3(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 2(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 3(%r14), %eax
+; CHECK-BASELINE-NEXT: xorb %dl, %al
+; CHECK-BASELINE-NEXT: andb 3(%r15), %al
+; CHECK-BASELINE-NEXT: xorb %dl, %al
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 4(%r14), %eax
; CHECK-BASELINE-NEXT: xorb %sil, %al
-; CHECK-BASELINE-NEXT: andb 4(%r12), %al
+; CHECK-BASELINE-NEXT: andb 4(%r15), %al
; CHECK-BASELINE-NEXT: xorb %sil, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 5(%r14), %eax
-; CHECK-BASELINE-NEXT: xorb %dil, %al
-; CHECK-BASELINE-NEXT: andb 5(%r12), %al
-; CHECK-BASELINE-NEXT: xorb %dil, %al
-; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 6(%r14), %eax
; CHECK-BASELINE-NEXT: xorb %bpl, %al
-; CHECK-BASELINE-NEXT: andb 6(%r12), %al
+; CHECK-BASELINE-NEXT: andb 5(%r15), %al
; CHECK-BASELINE-NEXT: xorb %bpl, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 7(%r14), %eax
-; CHECK-BASELINE-NEXT: xorb %r10b, %al
-; CHECK-BASELINE-NEXT: andb 7(%r12), %al
-; CHECK-BASELINE-NEXT: xorb %r10b, %al
+; CHECK-BASELINE-NEXT: movzbl 6(%r14), %eax
+; CHECK-BASELINE-NEXT: xorb %r8b, %al
+; CHECK-BASELINE-NEXT: andb 6(%r15), %al
+; CHECK-BASELINE-NEXT: xorb %r8b, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 8(%r14), %eax
+; CHECK-BASELINE-NEXT: movzbl 7(%r14), %eax
; CHECK-BASELINE-NEXT: xorb %r9b, %al
-; CHECK-BASELINE-NEXT: andb 8(%r12), %al
+; CHECK-BASELINE-NEXT: andb 7(%r15), %al
; CHECK-BASELINE-NEXT: xorb %r9b, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 8(%r14), %eax
+; CHECK-BASELINE-NEXT: xorb %r10b, %al
+; CHECK-BASELINE-NEXT: andb 8(%r15), %al
+; CHECK-BASELINE-NEXT: xorb %r10b, %al
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 9(%r14), %eax
-; CHECK-BASELINE-NEXT: xorb %r8b, %al
-; CHECK-BASELINE-NEXT: andb 9(%r12), %al
-; CHECK-BASELINE-NEXT: xorb %r8b, %al
+; CHECK-BASELINE-NEXT: xorb %r13b, %al
+; CHECK-BASELINE-NEXT: andb 9(%r15), %al
+; CHECK-BASELINE-NEXT: xorb %r13b, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 10(%r14), %ecx
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 10(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 10(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 11(%r14), %ecx
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 11(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 11(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 12(%r14), %ecx
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 12(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 12(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 13(%r14), %ecx
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 13(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 13(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 14(%r14), %ecx
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 14(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 14(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 15(%r14), %ecx
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 15(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 15(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 16(%r15), %eax
; CHECK-BASELINE-NEXT: movzbl 16(%r14), %ecx
+; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 16(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 16(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 17(%r15), %eax
; CHECK-BASELINE-NEXT: movzbl 17(%r14), %ecx
+; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 17(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 17(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 18(%r15), %eax
; CHECK-BASELINE-NEXT: movzbl 18(%r14), %ecx
+; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 18(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 18(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 19(%r15), %eax
; CHECK-BASELINE-NEXT: movzbl 19(%r14), %ecx
+; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 19(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 19(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 20(%r15), %eax
+; CHECK-BASELINE-NEXT: movzbl 20(%rbx), %eax
; CHECK-BASELINE-NEXT: movzbl 20(%r14), %ecx
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 20(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 20(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 21(%r15), %eax
-; CHECK-BASELINE-NEXT: movzbl 21(%r14), %ebp
+; CHECK-BASELINE-NEXT: movzbl 21(%rbx), %eax
+; CHECK-BASELINE-NEXT: movzbl 21(%r14), %r13d
+; CHECK-BASELINE-NEXT: xorb %al, %r13b
+; CHECK-BASELINE-NEXT: andb 21(%r15), %r13b
+; CHECK-BASELINE-NEXT: xorb %al, %r13b
+; CHECK-BASELINE-NEXT: movzbl 22(%rbx), %eax
+; CHECK-BASELINE-NEXT: movzbl 22(%r14), %ebp
; CHECK-BASELINE-NEXT: xorb %al, %bpl
-; CHECK-BASELINE-NEXT: andb 21(%r12), %bpl
+; CHECK-BASELINE-NEXT: andb 22(%r15), %bpl
; CHECK-BASELINE-NEXT: xorb %al, %bpl
-; CHECK-BASELINE-NEXT: movzbl 22(%r15), %eax
-; CHECK-BASELINE-NEXT: movzbl 22(%r14), %ebx
-; CHECK-BASELINE-NEXT: xorb %al, %bl
-; CHECK-BASELINE-NEXT: andb 22(%r12), %bl
-; CHECK-BASELINE-NEXT: xorb %al, %bl
-; CHECK-BASELINE-NEXT: movzbl 23(%r15), %eax
+; CHECK-BASELINE-NEXT: movzbl 23(%rbx), %eax
; CHECK-BASELINE-NEXT: movzbl 23(%r14), %r11d
; CHECK-BASELINE-NEXT: xorb %al, %r11b
-; CHECK-BASELINE-NEXT: andb 23(%r12), %r11b
+; CHECK-BASELINE-NEXT: andb 23(%r15), %r11b
; CHECK-BASELINE-NEXT: xorb %al, %r11b
-; CHECK-BASELINE-NEXT: movzbl 24(%r15), %eax
+; CHECK-BASELINE-NEXT: movzbl 24(%rbx), %eax
; CHECK-BASELINE-NEXT: movzbl 24(%r14), %r9d
; CHECK-BASELINE-NEXT: xorb %al, %r9b
-; CHECK-BASELINE-NEXT: andb 24(%r12), %r9b
+; CHECK-BASELINE-NEXT: andb 24(%r15), %r9b
; CHECK-BASELINE-NEXT: xorb %al, %r9b
-; CHECK-BASELINE-NEXT: movzbl 25(%r15), %eax
+; CHECK-BASELINE-NEXT: movzbl 25(%rbx), %eax
; CHECK-BASELINE-NEXT: movzbl 25(%r14), %r8d
; CHECK-BASELINE-NEXT: xorb %al, %r8b
-; CHECK-BASELINE-NEXT: andb 25(%r12), %r8b
+; CHECK-BASELINE-NEXT: andb 25(%r15), %r8b
; CHECK-BASELINE-NEXT: xorb %al, %r8b
-; CHECK-BASELINE-NEXT: movzbl 26(%r15), %eax
+; CHECK-BASELINE-NEXT: movzbl 26(%rbx), %eax
; CHECK-BASELINE-NEXT: movzbl 26(%r14), %edi
; CHECK-BASELINE-NEXT: xorb %al, %dil
-; CHECK-BASELINE-NEXT: andb 26(%r12), %dil
+; CHECK-BASELINE-NEXT: andb 26(%r15), %dil
; CHECK-BASELINE-NEXT: xorb %al, %dil
-; CHECK-BASELINE-NEXT: movzbl 27(%r15), %eax
+; CHECK-BASELINE-NEXT: movzbl 27(%rbx), %eax
; CHECK-BASELINE-NEXT: movzbl 27(%r14), %esi
; CHECK-BASELINE-NEXT: xorb %al, %sil
-; CHECK-BASELINE-NEXT: andb 27(%r12), %sil
+; CHECK-BASELINE-NEXT: andb 27(%r15), %sil
; CHECK-BASELINE-NEXT: xorb %al, %sil
-; CHECK-BASELINE-NEXT: movzbl 28(%r15), %eax
+; CHECK-BASELINE-NEXT: movzbl 28(%rbx), %eax
; CHECK-BASELINE-NEXT: movzbl 28(%r14), %edx
; CHECK-BASELINE-NEXT: xorb %al, %dl
-; CHECK-BASELINE-NEXT: andb 28(%r12), %dl
+; CHECK-BASELINE-NEXT: andb 28(%r15), %dl
; CHECK-BASELINE-NEXT: xorb %al, %dl
-; CHECK-BASELINE-NEXT: movzbl 29(%r15), %eax
+; CHECK-BASELINE-NEXT: movzbl 29(%rbx), %eax
; CHECK-BASELINE-NEXT: movzbl 29(%r14), %ecx
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 29(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 29(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: movzbl 30(%r15), %r10d
+; CHECK-BASELINE-NEXT: movzbl 30(%rbx), %r10d
; CHECK-BASELINE-NEXT: movzbl 30(%r14), %eax
; CHECK-BASELINE-NEXT: xorb %r10b, %al
-; CHECK-BASELINE-NEXT: andb 30(%r12), %al
+; CHECK-BASELINE-NEXT: andb 30(%r15), %al
; CHECK-BASELINE-NEXT: xorb %r10b, %al
-; CHECK-BASELINE-NEXT: movzbl 31(%r15), %r10d
-; CHECK-BASELINE-NEXT: movzbl 31(%r14), %r14d
-; CHECK-BASELINE-NEXT: xorb %r10b, %r14b
-; CHECK-BASELINE-NEXT: andb 31(%r12), %r14b
-; CHECK-BASELINE-NEXT: xorb %r10b, %r14b
-; CHECK-BASELINE-NEXT: movb %r14b, 31(%r13)
-; CHECK-BASELINE-NEXT: movb %al, 30(%r13)
-; CHECK-BASELINE-NEXT: movb %cl, 29(%r13)
-; CHECK-BASELINE-NEXT: movb %dl, 28(%r13)
-; CHECK-BASELINE-NEXT: movb %sil, 27(%r13)
-; CHECK-BASELINE-NEXT: movb %dil, 26(%r13)
-; CHECK-BASELINE-NEXT: movb %r8b, 25(%r13)
-; CHECK-BASELINE-NEXT: movb %r9b, 24(%r13)
-; CHECK-BASELINE-NEXT: movb %r11b, 23(%r13)
-; CHECK-BASELINE-NEXT: movb %bl, 22(%r13)
-; CHECK-BASELINE-NEXT: movb %bpl, 21(%r13)
+; CHECK-BASELINE-NEXT: movzbl 31(%rbx), %r10d
+; CHECK-BASELINE-NEXT: movzbl 31(%r14), %ebx
+; CHECK-BASELINE-NEXT: xorb %r10b, %bl
+; CHECK-BASELINE-NEXT: andb 31(%r15), %bl
+; CHECK-BASELINE-NEXT: xorb %r10b, %bl
+; CHECK-BASELINE-NEXT: movb %bl, 31(%r12)
+; CHECK-BASELINE-NEXT: movb %al, 30(%r12)
+; CHECK-BASELINE-NEXT: movb %cl, 29(%r12)
+; CHECK-BASELINE-NEXT: movb %dl, 28(%r12)
+; CHECK-BASELINE-NEXT: movb %sil, 27(%r12)
+; CHECK-BASELINE-NEXT: movb %dil, 26(%r12)
+; CHECK-BASELINE-NEXT: movb %r8b, 25(%r12)
+; CHECK-BASELINE-NEXT: movb %r9b, 24(%r12)
+; CHECK-BASELINE-NEXT: movb %r11b, 23(%r12)
+; CHECK-BASELINE-NEXT: movb %bpl, 22(%r12)
+; CHECK-BASELINE-NEXT: movb %r13b, 21(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 20(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 20(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 19(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 19(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 18(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 18(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 17(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 17(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 16(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 16(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 15(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 15(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 14(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 14(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 13(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 13(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 12(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 12(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 11(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 11(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 10(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 10(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 9(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 9(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 8(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 8(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 7(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 7(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 6(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 6(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 5(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 5(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 4(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 4(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 3(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 3(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 2(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 2(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 1(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 1(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, (%r13)
-; CHECK-BASELINE-NEXT: movq %r13, %rax
+; CHECK-BASELINE-NEXT: movb %al, (%r12)
+; CHECK-BASELINE-NEXT: movq %r12, %rax
; CHECK-BASELINE-NEXT: popq %rbx
; CHECK-BASELINE-NEXT: popq %r12
; CHECK-BASELINE-NEXT: popq %r13
@@ -3498,10 +3522,18 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: pushq %r13
; CHECK-SSE1-NEXT: pushq %r12
; CHECK-SSE1-NEXT: pushq %rbx
-; CHECK-SSE1-NEXT: movq %rcx, %r12
-; CHECK-SSE1-NEXT: movq %rdx, %r15
+; CHECK-SSE1-NEXT: movq %rcx, %r15
+; CHECK-SSE1-NEXT: movq %rdx, %rbx
; CHECK-SSE1-NEXT: movq %rsi, %r14
-; CHECK-SSE1-NEXT: movq %rdi, %r13
+; CHECK-SSE1-NEXT: movq %rdi, %r12
+; CHECK-SSE1-NEXT: movzbl 19(%rdx), %eax
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 18(%rdx), %eax
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 17(%rdx), %eax
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 16(%rdx), %eax
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 15(%rdx), %eax
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 14(%rdx), %eax
@@ -3514,241 +3546,241 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 10(%rdx), %eax
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 9(%rdx), %r8d
-; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r9d
-; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r10d
-; CHECK-SSE1-NEXT: movzbl 6(%rdx), %ebp
-; CHECK-SSE1-NEXT: movzbl 5(%rdx), %edi
+; CHECK-SSE1-NEXT: movzbl 9(%rdx), %r13d
+; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r10d
+; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r9d
+; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r8d
+; CHECK-SSE1-NEXT: movzbl 5(%rdx), %ebp
; CHECK-SSE1-NEXT: movzbl 4(%rdx), %esi
-; CHECK-SSE1-NEXT: movzbl 3(%rdx), %eax
-; CHECK-SSE1-NEXT: movzbl 2(%rdx), %ecx
-; CHECK-SSE1-NEXT: movzbl (%rdx), %r11d
-; CHECK-SSE1-NEXT: movzbl 1(%rdx), %edx
-; CHECK-SSE1-NEXT: movzbl (%r14), %ebx
-; CHECK-SSE1-NEXT: xorb %r11b, %bl
-; CHECK-SSE1-NEXT: andb (%r12), %bl
-; CHECK-SSE1-NEXT: xorb %r11b, %bl
-; CHECK-SSE1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 1(%r14), %r11d
-; CHECK-SSE1-NEXT: xorb %dl, %r11b
-; CHECK-SSE1-NEXT: andb 1(%r12), %r11b
-; CHECK-SSE1-NEXT: xorb %dl, %r11b
+; CHECK-SSE1-NEXT: movzbl 3(%rdx), %edx
+; CHECK-SSE1-NEXT: movzbl 2(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl (%rbx), %edi
+; CHECK-SSE1-NEXT: movzbl 1(%rbx), %ecx
+; CHECK-SSE1-NEXT: movzbl (%r14), %r11d
+; CHECK-SSE1-NEXT: xorb %dil, %r11b
+; CHECK-SSE1-NEXT: andb (%r15), %r11b
+; CHECK-SSE1-NEXT: xorb %dil, %r11b
; CHECK-SSE1-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 2(%r14), %edx
-; CHECK-SSE1-NEXT: xorb %cl, %dl
-; CHECK-SSE1-NEXT: andb 2(%r12), %dl
-; CHECK-SSE1-NEXT: xorb %cl, %dl
-; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 3(%r14), %ecx
+; CHECK-SSE1-NEXT: movzbl 1(%r14), %edi
+; CHECK-SSE1-NEXT: xorb %cl, %dil
+; CHECK-SSE1-NEXT: andb 1(%r15), %dil
+; CHECK-SSE1-NEXT: xorb %cl, %dil
+; CHECK-SSE1-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 2(%r14), %ecx
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 3(%r12), %cl
+; CHECK-SSE1-NEXT: andb 2(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 3(%r14), %eax
+; CHECK-SSE1-NEXT: xorb %dl, %al
+; CHECK-SSE1-NEXT: andb 3(%r15), %al
+; CHECK-SSE1-NEXT: xorb %dl, %al
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 4(%r14), %eax
; CHECK-SSE1-NEXT: xorb %sil, %al
-; CHECK-SSE1-NEXT: andb 4(%r12), %al
+; CHECK-SSE1-NEXT: andb 4(%r15), %al
; CHECK-SSE1-NEXT: xorb %sil, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 5(%r14), %eax
-; CHECK-SSE1-NEXT: xorb %dil, %al
-; CHECK-SSE1-NEXT: andb 5(%r12), %al
-; CHECK-SSE1-NEXT: xorb %dil, %al
-; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 6(%r14), %eax
; CHECK-SSE1-NEXT: xorb %bpl, %al
-; CHECK-SSE1-NEXT: andb 6(%r12), %al
+; CHECK-SSE1-NEXT: andb 5(%r15), %al
; CHECK-SSE1-NEXT: xorb %bpl, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 7(%r14), %eax
-; CHECK-SSE1-NEXT: xorb %r10b, %al
-; CHECK-SSE1-NEXT: andb 7(%r12), %al
-; CHECK-SSE1-NEXT: xorb %r10b, %al
+; CHECK-SSE1-NEXT: movzbl 6(%r14), %eax
+; CHECK-SSE1-NEXT: xorb %r8b, %al
+; CHECK-SSE1-NEXT: andb 6(%r15), %al
+; CHECK-SSE1-NEXT: xorb %r8b, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 8(%r14), %eax
+; CHECK-SSE1-NEXT: movzbl 7(%r14), %eax
; CHECK-SSE1-NEXT: xorb %r9b, %al
-; CHECK-SSE1-NEXT: andb 8(%r12), %al
+; CHECK-SSE1-NEXT: andb 7(%r15), %al
; CHECK-SSE1-NEXT: xorb %r9b, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 8(%r14), %eax
+; CHECK-SSE1-NEXT: xorb %r10b, %al
+; CHECK-SSE1-NEXT: andb 8(%r15), %al
+; CHECK-SSE1-NEXT: xorb %r10b, %al
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 9(%r14), %eax
-; CHECK-SSE1-NEXT: xorb %r8b, %al
-; CHECK-SSE1-NEXT: andb 9(%r12), %al
-; CHECK-SSE1-NEXT: xorb %r8b, %al
+; CHECK-SSE1-NEXT: xorb %r13b, %al
+; CHECK-SSE1-NEXT: andb 9(%r15), %al
+; CHECK-SSE1-NEXT: xorb %r13b, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 10(%r14), %ecx
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 10(%r12), %cl
+; CHECK-SSE1-NEXT: andb 10(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 11(%r14), %ecx
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 11(%r12), %cl
+; CHECK-SSE1-NEXT: andb 11(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 12(%r14), %ecx
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 12(%r12), %cl
+; CHECK-SSE1-NEXT: andb 12(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 13(%r14), %ecx
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 13(%r12), %cl
+; CHECK-SSE1-NEXT: andb 13(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 14(%r14), %ecx
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 14(%r12), %cl
+; CHECK-SSE1-NEXT: andb 14(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 15(%r14), %ecx
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 15(%r12), %cl
+; CHECK-SSE1-NEXT: andb 15(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 16(%r15), %eax
; CHECK-SSE1-NEXT: movzbl 16(%r14), %ecx
+; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 16(%r12), %cl
+; CHECK-SSE1-NEXT: andb 16(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 17(%r15), %eax
; CHECK-SSE1-NEXT: movzbl 17(%r14), %ecx
+; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 17(%r12), %cl
+; CHECK-SSE1-NEXT: andb 17(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 18(%r15), %eax
; CHECK-SSE1-NEXT: movzbl 18(%r14), %ecx
+; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 18(%r12), %cl
+; CHECK-SSE1-NEXT: andb 18(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 19(%r15), %eax
; CHECK-SSE1-NEXT: movzbl 19(%r14), %ecx
+; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 19(%r12), %cl
+; CHECK-SSE1-NEXT: andb 19(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 20(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 20(%rbx), %eax
; CHECK-SSE1-NEXT: movzbl 20(%r14), %ecx
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 20(%r12), %cl
+; CHECK-SSE1-NEXT: andb 20(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 21(%r15), %eax
-; CHECK-SSE1-NEXT: movzbl 21(%r14), %ebp
+; CHECK-SSE1-NEXT: movzbl 21(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl 21(%r14), %r13d
+; CHECK-SSE1-NEXT: xorb %al, %r13b
+; CHECK-SSE1-NEXT: andb 21(%r15), %r13b
+; CHECK-SSE1-NEXT: xorb %al, %r13b
+; CHECK-SSE1-NEXT: movzbl 22(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl 22(%r14), %ebp
; CHECK-SSE1-NEXT: xorb %al, %bpl
-; CHECK-SSE1-NEXT: andb 21(%r12), %bpl
+; CHECK-SSE1-NEXT: andb 22(%r15), %bpl
; CHECK-SSE1-NEXT: xorb %al, %bpl
-; CHECK-SSE1-NEXT: movzbl 22(%r15), %eax
-; CHECK-SSE1-NEXT: movzbl 22(%r14), %ebx
-; CHECK-SSE1-NEXT: xorb %al, %bl
-; CHECK-SSE1-NEXT: andb 22(%r12), %bl
-; CHECK-SSE1-NEXT: xorb %al, %bl
-; CHECK-SSE1-NEXT: movzbl 23(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 23(%rbx), %eax
; CHECK-SSE1-NEXT: movzbl 23(%r14), %r11d
; CHECK-SSE1-NEXT: xorb %al, %r11b
-; CHECK-SSE1-NEXT: andb 23(%r12), %r11b
+; CHECK-SSE1-NEXT: andb 23(%r15), %r11b
; CHECK-SSE1-NEXT: xorb %al, %r11b
-; CHECK-SSE1-NEXT: movzbl 24(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 24(%rbx), %eax
; CHECK-SSE1-NEXT: movzbl 24(%r14), %r9d
; CHECK-SSE1-NEXT: xorb %al, %r9b
-; CHECK-SSE1-NEXT: andb 24(%r12), %r9b
+; CHECK-SSE1-NEXT: andb 24(%r15), %r9b
; CHECK-SSE1-NEXT: xorb %al, %r9b
-; CHECK-SSE1-NEXT: movzbl 25(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 25(%rbx), %eax
; CHECK-SSE1-NEXT: movzbl 25(%r14), %r8d
; CHECK-SSE1-NEXT: xorb %al, %r8b
-; CHECK-SSE1-NEXT: andb 25(%r12), %r8b
+; CHECK-SSE1-NEXT: andb 25(%r15), %r8b
; CHECK-SSE1-NEXT: xorb %al, %r8b
-; CHECK-SSE1-NEXT: movzbl 26(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 26(%rbx), %eax
; CHECK-SSE1-NEXT: movzbl 26(%r14), %edi
; CHECK-SSE1-NEXT: xorb %al, %dil
-; CHECK-SSE1-NEXT: andb 26(%r12), %dil
+; CHECK-SSE1-NEXT: andb 26(%r15), %dil
; CHECK-SSE1-NEXT: xorb %al, %dil
-; CHECK-SSE1-NEXT: movzbl 27(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 27(%rbx), %eax
; CHECK-SSE1-NEXT: movzbl 27(%r14), %esi
; CHECK-SSE1-NEXT: xorb %al, %sil
-; CHECK-SSE1-NEXT: andb 27(%r12), %sil
+; CHECK-SSE1-NEXT: andb 27(%r15), %sil
; CHECK-SSE1-NEXT: xorb %al, %sil
-; CHECK-SSE1-NEXT: movzbl 28(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 28(%rbx), %eax
; CHECK-SSE1-NEXT: movzbl 28(%r14), %edx
; CHECK-SSE1-NEXT: xorb %al, %dl
-; CHECK-SSE1-NEXT: andb 28(%r12), %dl
+; CHECK-SSE1-NEXT: andb 28(%r15), %dl
; CHECK-SSE1-NEXT: xorb %al, %dl
-; CHECK-SSE1-NEXT: movzbl 29(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 29(%rbx), %eax
; CHECK-SSE1-NEXT: movzbl 29(%r14), %ecx
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 29(%r12), %cl
+; CHECK-SSE1-NEXT: andb 29(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: movzbl 30(%r15), %r10d
+; CHECK-SSE1-NEXT: movzbl 30(%rbx), %r10d
; CHECK-SSE1-NEXT: movzbl 30(%r14), %eax
; CHECK-SSE1-NEXT: xorb %r10b, %al
-; CHECK-SSE1-NEXT: andb 30(%r12), %al
+; CHECK-SSE1-NEXT: andb 30(%r15), %al
; CHECK-SSE1-NEXT: xorb %r10b, %al
-; CHECK-SSE1-NEXT: movzbl 31(%r15), %r10d
-; CHECK-SSE1-NEXT: movzbl 31(%r14), %r14d
-; CHECK-SSE1-NEXT: xorb %r10b, %r14b
-; CHECK-SSE1-NEXT: andb 31(%r12), %r14b
-; CHECK-SSE1-NEXT: xorb %r10b, %r14b
-; CHECK-SSE1-NEXT: movb %r14b, 31(%r13)
-; CHECK-SSE1-NEXT: movb %al, 30(%r13)
-; CHECK-SSE1-NEXT: movb %cl, 29(%r13)
-; CHECK-SSE1-NEXT: movb %dl, 28(%r13)
-; CHECK-SSE1-NEXT: movb %sil, 27(%r13)
-; CHECK-SSE1-NEXT: movb %dil, 26(%r13)
-; CHECK-SSE1-NEXT: movb %r8b, 25(%r13)
-; CHECK-SSE1-NEXT: movb %r9b, 24(%r13)
-; CHECK-SSE1-NEXT: movb %r11b, 23(%r13)
-; CHECK-SSE1-NEXT: movb %bl, 22(%r13)
-; CHECK-SSE1-NEXT: movb %bpl, 21(%r13)
+; CHECK-SSE1-NEXT: movzbl 31(%rbx), %r10d
+; CHECK-SSE1-NEXT: movzbl 31(%r14), %ebx
+; CHECK-SSE1-NEXT: xorb %r10b, %bl
+; CHECK-SSE1-NEXT: andb 31(%r15), %bl
+; CHECK-SSE1-NEXT: xorb %r10b, %bl
+; CHECK-SSE1-NEXT: movb %bl, 31(%r12)
+; CHECK-SSE1-NEXT: movb %al, 30(%r12)
+; CHECK-SSE1-NEXT: movb %cl, 29(%r12)
+; CHECK-SSE1-NEXT: movb %dl, 28(%r12)
+; CHECK-SSE1-NEXT: movb %sil, 27(%r12)
+; CHECK-SSE1-NEXT: movb %dil, 26(%r12)
+; CHECK-SSE1-NEXT: movb %r8b, 25(%r12)
+; CHECK-SSE1-NEXT: movb %r9b, 24(%r12)
+; CHECK-SSE1-NEXT: movb %r11b, 23(%r12)
+; CHECK-SSE1-NEXT: movb %bpl, 22(%r12)
+; CHECK-SSE1-NEXT: movb %r13b, 21(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 20(%r13)
+; CHECK-SSE1-NEXT: movb %al, 20(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 19(%r13)
+; CHECK-SSE1-NEXT: movb %al, 19(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 18(%r13)
+; CHECK-SSE1-NEXT: movb %al, 18(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 17(%r13)
+; CHECK-SSE1-NEXT: movb %al, 17(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 16(%r13)
+; CHECK-SSE1-NEXT: movb %al, 16(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 15(%r13)
+; CHECK-SSE1-NEXT: movb %al, 15(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 14(%r13)
+; CHECK-SSE1-NEXT: movb %al, 14(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 13(%r13)
+; CHECK-SSE1-NEXT: movb %al, 13(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 12(%r13)
+; CHECK-SSE1-NEXT: movb %al, 12(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 11(%r13)
+; CHECK-SSE1-NEXT: movb %al, 11(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 10(%r13)
+; CHECK-SSE1-NEXT: movb %al, 10(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 9(%r13)
+; CHECK-SSE1-NEXT: movb %al, 9(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 8(%r13)
+; CHECK-SSE1-NEXT: movb %al, 8(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 7(%r13)
+; CHECK-SSE1-NEXT: movb %al, 7(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 6(%r13)
+; CHECK-SSE1-NEXT: movb %al, 6(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 5(%r13)
+; CHECK-SSE1-NEXT: movb %al, 5(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 4(%r13)
+; CHECK-SSE1-NEXT: movb %al, 4(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 3(%r13)
+; CHECK-SSE1-NEXT: movb %al, 3(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 2(%r13)
+; CHECK-SSE1-NEXT: movb %al, 2(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 1(%r13)
+; CHECK-SSE1-NEXT: movb %al, 1(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, (%r13)
-; CHECK-SSE1-NEXT: movq %r13, %rax
+; CHECK-SSE1-NEXT: movb %al, (%r12)
+; CHECK-SSE1-NEXT: movq %r12, %rax
; CHECK-SSE1-NEXT: popq %rbx
; CHECK-SSE1-NEXT: popq %r12
; CHECK-SSE1-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
index ebb5e135eacd02..5e63e563a37fe2 100644
--- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
@@ -170,44 +170,45 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %esi
; X86-NEXT: shll %cl, %esi
-; X86-NEXT: movl %esi, %ebp
-; X86-NEXT: shrl %cl, %ebp
-; X86-NEXT: cmpl %ebp, %ebx
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: shrl %cl, %ebx
+; X86-NEXT: cmpl %ebx, %edi
; X86-NEXT: movl $-1, %edx
; X86-NEXT: cmovnel %edx, %esi
-; X86-NEXT: movl $-1, %ebx
-; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl $-1, %edi
+; X86-NEXT: movl %ebp, %edx
; X86-NEXT: movb %ah, %cl
; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: shrl %cl, %ebp
-; X86-NEXT: cmpl %ebp, %edi
-; X86-NEXT: cmovnel %ebx, %edx
-; X86-NEXT: movl $-1, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: shrl %cl, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl %ebx, %ebp
+; X86-NEXT: cmovnel %edi, %edx
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: shrl %cl, %ebp
-; X86-NEXT: cmpl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: cmovnel %eax, %edi
-; X86-NEXT: movl %ebx, %ebp
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: shll %cl, %ebp
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: shrl %cl, %eax
-; X86-NEXT: cmpl %eax, %ebx
+; X86-NEXT: movl %ebp, %ebx
+; X86-NEXT: shrl %cl, %ebx
+; X86-NEXT: cmpl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl $-1, %eax
; X86-NEXT: cmovnel %eax, %ebp
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: cmpl %eax, %edi
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: cmovnel %eax, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ebp, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %ebp, 8(%eax)
; X86-NEXT: movl %edx, 4(%eax)
; X86-NEXT: movl %esi, (%eax)
; X86-NEXT: popl %esi
@@ -304,8 +305,9 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: shll %cl, %ebx
; X86-NEXT: movzwl %bx, %edi
@@ -315,33 +317,32 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; X86-NEXT: cmovnel %eax, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movb %dl, %cl
; X86-NEXT: shll %cl, %eax
; X86-NEXT: movzwl %ax, %edi
; X86-NEXT: shrl %cl, %edi
; X86-NEXT: cmpw %di, %si
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $65535, %esi # imm = 0xFFFF
-; X86-NEXT: cmovnel %esi, %eax
+; X86-NEXT: movl $65535, %edi # imm = 0xFFFF
+; X86-NEXT: cmovnel %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movb %ch, %cl
; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movzwl %ax, %edx
-; X86-NEXT: shrl %cl, %edx
-; X86-NEXT: cmpw %dx, %bp
+; X86-NEXT: movzwl %ax, %esi
+; X86-NEXT: shrl %cl, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: cmpw %si, %bp
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmovnel %esi, %eax
+; X86-NEXT: cmovnel %edi, %eax
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movl $65535, %eax # imm = 0xFFFF
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: shll %cl, %ebp
-; X86-NEXT: movzwl %bp, %edx
-; X86-NEXT: shrl %cl, %edx
-; X86-NEXT: cmpw %dx, %si
+; X86-NEXT: movzwl %bp, %esi
+; X86-NEXT: shrl %cl, %esi
+; X86-NEXT: cmpw %si, %dx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmovnel %eax, %ebp
+; X86-NEXT: cmovnel %edi, %ebp
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: shll %cl, %ebx
; X86-NEXT: movzwl %bx, %esi
@@ -486,44 +487,52 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $48, %esp
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT: movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movb %bl, %bh
-; X86-NEXT: shlb %cl, %bh
-; X86-NEXT: movzbl %bh, %edi
-; X86-NEXT: shrb %cl, %bh
-; X86-NEXT: cmpb %bh, %bl
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: movb {{[0-9]+}}(%esp), %bh
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movb %dl, %al
+; X86-NEXT: shlb %cl, %al
+; X86-NEXT: movzbl %al, %edi
+; X86-NEXT: shrb %cl, %al
+; X86-NEXT: cmpb %al, %dl
; X86-NEXT: movl $255, %esi
; X86-NEXT: cmovnel %esi, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movb %dh, %bl
+; X86-NEXT: movb %bh, %al
+; X86-NEXT: movb %ch, %cl
+; X86-NEXT: shlb %cl, %al
+; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: shrb %cl, %al
+; X86-NEXT: cmpb %al, %bh
+; X86-NEXT: cmovnel %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movb %bl, %al
; X86-NEXT: movb %ah, %cl
-; X86-NEXT: shlb %cl, %bl
-; X86-NEXT: movzbl %bl, %edi
-; X86-NEXT: shrb %cl, %bl
-; X86-NEXT: cmpb %bl, %dh
-; X86-NEXT: cmovnel %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movb %ch, %ah
-; X86-NEXT: movb %dl, %cl
-; X86-NEXT: shlb %cl, %ah
-; X86-NEXT: movzbl %ah, %edi
-; X86-NEXT: shrb %cl, %ah
+; X86-NEXT: shlb %cl, %al
+; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: shrb %cl, %al
+; X86-NEXT: cmpb %al, %bl
+; X86-NEXT: cmovnel %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
+; X86-NEXT: movb %ah, %al
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shlb %cl, %al
+; X86-NEXT: movzbl %al, %edi
+; X86-NEXT: shrb %cl, %al
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: cmpb %ah, %ch
+; X86-NEXT: cmpb %al, %ah
; X86-NEXT: cmovnel %esi, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movb %dl, %ah
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: shlb %cl, %ah
-; X86-NEXT: movzbl %ah, %edi
-; X86-NEXT: shrb %cl, %ah
-; X86-NEXT: cmpb %ah, %dl
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shlb %cl, %al
+; X86-NEXT: movzbl %al, %edi
+; X86-NEXT: shrb %cl, %al
+; X86-NEXT: cmpb %al, %dl
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmovnel %esi, %edi
@@ -588,15 +597,6 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; X86-NEXT: shrb %cl, %dl
; X86-NEXT: cmpb %dl, %al
; X86-NEXT: cmovnel %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: shlb %cl, %dl
-; X86-NEXT: movzbl %dl, %edi
-; X86-NEXT: shrb %cl, %dl
-; X86-NEXT: cmpb %dl, %al
-; X86-NEXT: cmovnel %esi, %edi
; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index f84131dfc87970..70656b5b3566be 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -635,13 +635,13 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: orl (%esp), %ebp # 4-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: addl $36, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
@@ -655,7 +655,7 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $40, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
@@ -666,7 +666,7 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -675,30 +675,31 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %ah
; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %ah
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %ah, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %ah, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edx), %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, (%esp) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 8(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $40, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -764,7 +765,6 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X86-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_16bytes:
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
@@ -789,30 +789,28 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx def $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%esi)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
%bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -950,14 +948,14 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
@@ -971,7 +969,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $40, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
@@ -986,35 +984,37 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %cl
; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%edi), %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl (%esp), %edx # 4-byte Folded Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $40, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -1081,7 +1081,6 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X86-HAVE-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
@@ -1107,31 +1106,29 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebx), %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebx), %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx def $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%edx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
%bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1266,13 +1263,13 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: orl (%esp), %ebp # 4-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: addl $36, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
@@ -1286,7 +1283,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $40, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
@@ -1297,7 +1294,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
@@ -1307,30 +1304,31 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edx), %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, (%esp) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 8(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $40, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -1397,7 +1395,6 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X86-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_16bytes:
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
@@ -1423,30 +1420,28 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx def $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%esi)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
%bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1644,7 +1639,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
@@ -1662,7 +1657,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -1675,48 +1670,47 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al
; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah
; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edx), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edx), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -1726,7 +1720,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch
; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
@@ -1751,12 +1745,12 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
@@ -1773,7 +1767,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $100, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax
@@ -1809,64 +1803,72 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebp), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp), %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ecx, (%esp) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%ebx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $100, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -1988,13 +1990,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp
@@ -2008,7 +2010,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -2023,57 +2025,54 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%eax,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%edi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%edi), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%edi), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%edi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, (%esp), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx def $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%ebx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%ebx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -2411,7 +2410,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $100, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax
@@ -2448,65 +2447,71 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %cl
; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%ebx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%ebx), %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%ebx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%ebx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%ebx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%ebx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%ebx), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%ebx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%edx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%edx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%edx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%edx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $100, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -2631,13 +2636,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $96, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp
@@ -2651,7 +2656,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -2668,63 +2673,64 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%esi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%esi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%esi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %edx, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%esi), %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %edx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%esi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%esi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%esi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%esi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%esi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, (%esp), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx def $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl (%esp), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 28(%edi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 20(%edi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%edi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%edi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 16(%edi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%edi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $96, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
%res = shl i256 %src, %bitOff
@@ -2924,7 +2930,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi
@@ -2942,7 +2948,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -2956,48 +2962,47 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al
; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah
; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edx), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edx), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -3007,7 +3012,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch
; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
@@ -3032,12 +3037,12 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
@@ -3054,7 +3059,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $100, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %eax
@@ -3091,64 +3096,72 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebp), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp), %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ecx, (%esp) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%ebx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $100, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -3273,13 +3286,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp
@@ -3293,7 +3306,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -3309,57 +3322,54 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%eax,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%edi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%edi), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%edi), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%edi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, (%esp), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx def $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%ebx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%ebx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -4036,7 +4046,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $232, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4049,7 +4059,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esi), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esi), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esi), %eax
@@ -4057,7 +4067,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esi), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esi), %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esi), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esi), %edi
@@ -4072,7 +4082,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -4080,7 +4090,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -4112,84 +4122,85 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%esi), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%esi), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: notl %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%esi), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%esi), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%esi), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%esi), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%esi), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%esi), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 140(%esp,%esi), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 144(%esp,%esi), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 148(%esp,%esi), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 152(%esp,%esi), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 156(%esp,%esi), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 160(%esp,%esi), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -4200,45 +4211,59 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 164(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%esi), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%esi)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%esi)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%esi)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esi)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $232, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -4478,7 +4503,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $212, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4534,8 +4559,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -4554,118 +4579,115 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %ecx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%eax), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%eax), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%eax), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 140(%esp,%eax), %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%eax), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, (%esp) # 4-byte Folded Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 144(%esp,%eax), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, (%esp), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 20(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 52(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $212, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -5366,7 +5388,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $228, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -5377,7 +5399,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx
@@ -5385,7 +5407,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
@@ -5404,7 +5426,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -5412,7 +5434,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -5443,87 +5465,87 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: notl %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%esi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 212(%esp,%esi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -5534,45 +5556,57 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 56(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, (%esp) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 60(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 52(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 44(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 36(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 56(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 48(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 40(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 32(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $228, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -5826,43 +5860,43 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $204, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $212, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -5882,13 +5916,14 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %ebx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -5905,126 +5940,127 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esi), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 196(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, (%esp), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %eax, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 60(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%ecx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%esi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 44(%ecx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 36(%ecx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%ecx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ecx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 12(%ecx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 44(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 40(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 32(%ecx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ecx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ecx)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 56(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $204, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $212, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -6705,7 +6741,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $232, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6718,7 +6754,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx
@@ -6726,7 +6762,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi
@@ -6740,7 +6776,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -6748,7 +6784,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -6782,84 +6818,85 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%esi), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%esi), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: notl %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%esi), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%esi), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%esi), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%esi), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%esi), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%esi), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 140(%esp,%esi), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 144(%esp,%esi), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 148(%esp,%esi), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 152(%esp,%esi), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 156(%esp,%esi), %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 160(%esp,%esi), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -6870,45 +6907,59 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 164(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%esi), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%esi)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%esi)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%esi)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esi)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%esi)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $232, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -7149,7 +7200,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $212, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -7158,9 +7209,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx
@@ -7175,168 +7226,166 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%eax), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%eax), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%eax), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 140(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 144(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, (%esp), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 56(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 48(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 40(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 32(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 16(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 52(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $212, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll b/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll
index 47aefdbf0e466e..e973751912425c 100644
--- a/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll
@@ -65,11 +65,11 @@ define i64 @read_flags_reg_pressure() nounwind {
; CHECK-NEXT: subq $16, %rsp
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: movq %rdx, (%rsp) # 8-byte Spill
+; CHECK-NEXT: movq %rcx, (%rsp) # 8-byte Spill
; CHECK-NEXT: pushfq
-; CHECK-NEXT: popq %rdx
-; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq (%rsp), %rdx # 8-byte Reload
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq (%rsp), %rcx # 8-byte Reload
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
@@ -96,11 +96,11 @@ define i64 @read_flags_reg_pressure() nounwind {
; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-NEXT: #APP
; WIN64-NEXT: #NO_APP
-; WIN64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; WIN64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; WIN64-NEXT: pushfq
-; WIN64-NEXT: popq %rdx
-; WIN64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; WIN64-NEXT: popq %rcx
+; WIN64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; WIN64-NEXT: #APP
; WIN64-NEXT: #NO_APP
; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
@@ -148,11 +148,11 @@ define void @write_flags_reg_pressure(i64 noundef %0) nounwind {
; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: movq %rdx, (%rsp) # 8-byte Spill
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; CHECK-NEXT: pushq %rdx
+; CHECK-NEXT: movq %rcx, (%rsp) # 8-byte Spill
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; CHECK-NEXT: pushq %rcx
; CHECK-NEXT: popfq
-; CHECK-NEXT: movq (%rsp), %rdx # 8-byte Reload
+; CHECK-NEXT: movq (%rsp), %rcx # 8-byte Reload
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: addq $16, %rsp
@@ -179,11 +179,11 @@ define void @write_flags_reg_pressure(i64 noundef %0) nounwind {
; WIN64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; WIN64-NEXT: #APP
; WIN64-NEXT: #NO_APP
-; WIN64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; WIN64-NEXT: pushq %rdx
+; WIN64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; WIN64-NEXT: pushq %rcx
; WIN64-NEXT: popfq
-; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; WIN64-NEXT: #APP
; WIN64-NEXT: #NO_APP
; WIN64-NEXT: addq $16, %rsp
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
index 624e8d5d54ba8f..097e0b01cec98c 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
@@ -2042,15 +2042,14 @@ void CodeGenRegBank::computeRegUnitSets() {
// Iterate over all unit sets, including new ones added by this loop.
unsigned NumRegUnitSubSets = RegUnitSets.size();
- for (unsigned Idx = 0, EndIdx = RegUnitSets.size(); Idx != EndIdx; ++Idx) {
+ for (unsigned Idx = 0; Idx != RegUnitSets.size(); ++Idx) {
// In theory, this is combinatorial. In practice, it needs to be bounded
// by a small number of sets for regpressure to be efficient.
// If the assert is hit, we need to implement pruning.
- assert(Idx < (2 * NumRegUnitSubSets) && "runaway unit set inference");
+ assert(Idx < (8 * NumRegUnitSubSets) && "runaway unit set inference");
// Compare new sets with all original classes.
- for (unsigned SearchIdx = (Idx >= NumRegUnitSubSets) ? 0 : Idx + 1;
- SearchIdx != EndIdx; ++SearchIdx) {
+ for (unsigned SearchIdx = 0; SearchIdx != Idx; ++SearchIdx) {
std::vector<unsigned> Intersection;
std::set_intersection(
RegUnitSets[Idx].Units.begin(), RegUnitSets[Idx].Units.end(),
@@ -2059,8 +2058,8 @@ void CodeGenRegBank::computeRegUnitSets() {
if (Intersection.empty())
continue;
- RegUnitSet RUSet(RegUnitSets[Idx].Name + "_with_" +
- RegUnitSets[SearchIdx].Name);
+ RegUnitSet RUSet(RegUnitSets[SearchIdx].Name + "_with_" +
+ RegUnitSets[Idx].Name);
std::set_union(RegUnitSets[Idx].Units.begin(),
RegUnitSets[Idx].Units.end(),
RegUnitSets[SearchIdx].Units.begin(),
>From 42e07c71c845d5dc1b9d373d80075efb4e3c39ac Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 30 Oct 2024 10:48:39 +0000
Subject: [PATCH 2/2] Bump unit set inference limit
AArch64 currently needs the limit set to 94 or higher. This probably
indicates that there is some deeper problem.
---
llvm/utils/TableGen/Common/CodeGenRegisters.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
index f9eef3b3e22f1a..a79cae4951fcb7 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
@@ -2051,7 +2051,7 @@ void CodeGenRegBank::computeRegUnitSets() {
// In theory, this is combinatorial. In practice, it needs to be bounded
// by a small number of sets for regpressure to be efficient.
// If the assert is hit, we need to implement pruning.
- assert(Idx < (8 * NumRegUnitSubSets) && "runaway unit set inference");
+ assert(Idx < (100 * NumRegUnitSubSets) && "runaway unit set inference");
// Compare new sets with all original classes.
for (unsigned SearchIdx = 0; SearchIdx != Idx; ++SearchIdx) {
More information about the llvm-commits
mailing list