[llvm] 3f22a49 - [X86] selectLEAAddr - add X86ISD::SMUL/UMULO handling
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 17 05:51:25 PST 2022
Author: Simon Pilgrim
Date: 2022-02-17T13:51:02Z
New Revision: 3f22a4962dafe2718a92b3cd9b5be4a6fcc83b77
URL: https://github.com/llvm/llvm-project/commit/3f22a4962dafe2718a92b3cd9b5be4a6fcc83b77
DIFF: https://github.com/llvm/llvm-project/commit/3f22a4962dafe2718a92b3cd9b5be4a6fcc83b77.diff
LOG: [X86] selectLEAAddr - add X86ISD::SMUL/UMULO handling
After D118128 relaxed the heuristic to require only one EFLAGS-generating operand, it now makes sense to avoid X86ISD::SMUL/UMULO duplication as well.
Differential Revision: https://reviews.llvm.org/D119578
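
For context, a minimal IR reduction of the affected pattern, adapted from the smul_add_imm test in llvm/test/CodeGen/X86/select-lea.ll below (an illustrative sketch, not part of the commit; value names other than %o are invented for exposition). Because LEA performs the add without touching EFLAGS, the add-with-immediate can now be selected as an LEA of the single imull result, and the multiply's overflow flag feeds cmovno directly instead of forcing the multiply node to be duplicated:

  declare { i32, i1 } @llvm.smul.with.overflow.i32(i32, i32)

  ; r = ovf ? (x*y + 100) : x*y -- the add previously required a second imull
  define i32 @smul_add_imm(i32 %x, i32 %y) {
    %o   = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %x, i32 %y)
    %mul = extractvalue { i32, i1 } %o, 0
    %ovf = extractvalue { i32, i1 } %o, 1
    %add = add i32 %mul, 100
    %r   = select i1 %ovf, i32 %add, i32 %mul
    ret i32 %r
  }

Compiling this with llc (e.g. llc -mtriple=x86_64-- < test.ll) should produce code along the lines of the updated X64 CHECK lines in the test diff below.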
Added:
Modified:
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
llvm/test/CodeGen/X86/select-lea.ll
llvm/test/CodeGen/X86/umul_fix_sat.ll
llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
llvm/test/CodeGen/X86/vec_umulo.ll
llvm/test/CodeGen/X86/xmulo.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 17f4b1ec5bf78..66c44a49f4f68 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2782,10 +2782,10 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
case X86ISD::SUB:
case X86ISD::ADC:
case X86ISD::SBB:
-    /* TODO: These opcodes can be added safely, but we may want to justify
-             their inclusion for different reasons (better for reg-alloc).
case X86ISD::SMUL:
case X86ISD::UMUL:
+    /* TODO: These opcodes can be added safely, but we may want to justify
+             their inclusion for different reasons (better for reg-alloc).
case X86ISD::OR:
case X86ISD::XOR:
case X86ISD::AND:
diff --git a/llvm/test/CodeGen/X86/select-lea.ll b/llvm/test/CodeGen/X86/select-lea.ll
index 487b1f3d3a223..a849280c1377e 100644
--- a/llvm/test/CodeGen/X86/select-lea.ll
+++ b/llvm/test/CodeGen/X86/select-lea.ll
@@ -330,35 +330,27 @@ define i32 @usub_add_load(i32 %x, i32 %y, i32* %pz) nounwind {
define i32 @smul_add_imm(i32 %x, i32 %y) {
; X64-LABEL: smul_add_imm:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: imull %esi, %eax
-; X64-NEXT: addl $100, %eax
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: imull %esi, %edi
+; X64-NEXT: leal 100(%rdi), %eax
; X64-NEXT: cmovnol %edi, %eax
; X64-NEXT: retq
;
; CMOV-LABEL: smul_add_imm:
; CMOV: # %bb.0:
-; CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CMOV-NEXT: movl %eax, %edx
-; CMOV-NEXT: imull %ecx, %edx
-; CMOV-NEXT: addl $100, %edx
-; CMOV-NEXT: imull %ecx, %eax
-; CMOV-NEXT: cmovol %edx, %eax
+; CMOV-NEXT: imull {{[0-9]+}}(%esp), %ecx
+; CMOV-NEXT: leal 100(%ecx), %eax
+; CMOV-NEXT: cmovnol %ecx, %eax
; CMOV-NEXT: retl
;
; NOCMOV-LABEL: smul_add_imm:
; NOCMOV: # %bb.0:
; NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; NOCMOV-NEXT: movl %eax, %ecx
-; NOCMOV-NEXT: imull %edx, %ecx
-; NOCMOV-NEXT: imull %edx, %eax
+; NOCMOV-NEXT: imull {{[0-9]+}}(%esp), %eax
; NOCMOV-NEXT: jno .LBB8_2
; NOCMOV-NEXT: # %bb.1:
-; NOCMOV-NEXT: addl $100, %ecx
-; NOCMOV-NEXT: movl %ecx, %eax
+; NOCMOV-NEXT: addl $100, %eax
; NOCMOV-NEXT: .LBB8_2:
; NOCMOV-NEXT: retl
%o = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %x, i32 %y)
@@ -422,10 +414,8 @@ define i32 @umul_add_imm(i32 %x, i32 %y) {
; X64-NEXT: movl %edi, %eax
; X64-NEXT: mull %esi
; X64-NEXT: # kill: def $eax killed $eax def $rax
-; X64-NEXT: seto %cl
-; X64-NEXT: leal 100(%rax), %edx
-; X64-NEXT: testb %cl, %cl
-; X64-NEXT: cmovnel %edx, %eax
+; X64-NEXT: leal 100(%rax), %ecx
+; X64-NEXT: cmovol %ecx, %eax
; X64-NEXT: # kill: def $eax killed $eax killed $rax
; X64-NEXT: retq
;
@@ -433,10 +423,8 @@ define i32 @umul_add_imm(i32 %x, i32 %y) {
; CMOV: # %bb.0:
; CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; CMOV-NEXT: mull {{[0-9]+}}(%esp)
-; CMOV-NEXT: seto %cl
-; CMOV-NEXT: leal 100(%eax), %edx
-; CMOV-NEXT: testb %cl, %cl
-; CMOV-NEXT: cmovnel %edx, %eax
+; CMOV-NEXT: leal 100(%eax), %ecx
+; CMOV-NEXT: cmovol %ecx, %eax
; CMOV-NEXT: retl
;
; NOCMOV-LABEL: umul_add_imm:
diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll
index 504557242c305..247b5ee17e7a5 100644
--- a/llvm/test/CodeGen/X86/umul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll
@@ -281,21 +281,21 @@ define i64 @func5(i64 %x, i64 %y) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: testl %esi, %esi
; X86-NEXT: setne %dl
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %bl
; X86-NEXT: andb %dl, %bl
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: seto %bh
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: seto %cl
; X86-NEXT: orb %bh, %cl
-; X86-NEXT: addl %eax, %esi
+; X86-NEXT: leal (%edi,%eax), %esi
; X86-NEXT: movl %edx, %eax
; X86-NEXT: mull %ebp
; X86-NEXT: addl %esi, %edx
diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
index 40fc6db7fe6b2..3d7544f7f6814 100644
--- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
@@ -19,10 +19,10 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; X64-NEXT: mulq %rdi
; X64-NEXT: seto %r11b
; X64-NEXT: orb %r10b, %r11b
-; X64-NEXT: addq %rax, %rsi
+; X64-NEXT: leaq (%rsi,%rax), %rcx
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r8
-; X64-NEXT: addq %rsi, %rdx
+; X64-NEXT: addq %rcx, %rdx
; X64-NEXT: setb %cl
; X64-NEXT: orb %r11b, %cl
; X64-NEXT: orb %r9b, %cl
@@ -38,64 +38,63 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: subl $28, %esp
-; X86-NEXT: .cfi_def_cfa_offset 48
+; X86-NEXT: subl $24, %esp
+; X86-NEXT: .cfi_def_cfa_offset 44
; X86-NEXT: .cfi_offset %esi, -20
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: addl %ecx, %esi
; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: leal (%ecx,%eax), %ecx
+; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: addl %ecx, %ebp
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: leal (%ecx,%eax), %ecx
; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %edi, %ebx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebp, %ebx
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: adcl %ebp, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %edi, %ecx
+; X86-NEXT: addl %esi, %ecx
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: mull %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -103,12 +102,12 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: adcl %edi, %edx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: testl %ebp, %ebp
; X86-NEXT: setne %cl
@@ -121,10 +120,10 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
; X86-NEXT: orb %ch, %cl
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT: setne %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: testl %ebp, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: setne %bh
; X86-NEXT: andb %cl, %bh
; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
@@ -133,7 +132,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: setne %bl
-; X86-NEXT: orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, 4(%ecx)
@@ -150,7 +149,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; X86-NEXT: andb $1, %al
; X86-NEXT: movb %al, 16(%ecx)
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: addl $28, %esp
+; X86-NEXT: addl $24, %esp
; X86-NEXT: .cfi_def_cfa_offset 20
; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 16
diff --git a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
index 2b7e032fb4b7f..3bbeec17c7a9e 100644
--- a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
@@ -19,21 +19,21 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: testl %esi, %esi
; X86-NEXT: setne %dl
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %bl
; X86-NEXT: andb %dl, %bl
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: seto %bh
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: seto %ch
; X86-NEXT: orb %bh, %ch
-; X86-NEXT: addl %eax, %esi
+; X86-NEXT: leal (%edi,%eax), %esi
; X86-NEXT: movl %edx, %eax
; X86-NEXT: mull %ebp
; X86-NEXT: addl %esi, %edx
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index 51de68916596b..bd448d5d19244 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -2952,63 +2952,61 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
; SSE2-NEXT: movq %rcx, %r12
; SSE2-NEXT: movq %rdx, %r11
; SSE2-NEXT: movq %rsi, %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r9
; SSE2-NEXT: testq %r10, %r10
-; SSE2-NEXT: setne %cl
+; SSE2-NEXT: setne %dl
; SSE2-NEXT: testq %rsi, %rsi
-; SSE2-NEXT: setne %r13b
-; SSE2-NEXT: andb %cl, %r13b
+; SSE2-NEXT: setne %bpl
+; SSE2-NEXT: andb %dl, %bpl
; SSE2-NEXT: mulq %r8
; SSE2-NEXT: movq %rax, %rsi
-; SSE2-NEXT: seto %bpl
+; SSE2-NEXT: seto %bl
; SSE2-NEXT: movq %r10, %rax
; SSE2-NEXT: mulq %rdi
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: seto %bl
-; SSE2-NEXT: orb %bpl, %bl
-; SSE2-NEXT: addq %rsi, %rcx
+; SSE2-NEXT: seto %cl
+; SSE2-NEXT: orb %bl, %cl
+; SSE2-NEXT: leaq (%rsi,%rax), %rbx
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: mulq %r8
-; SSE2-NEXT: movq %rax, %r8
+; SSE2-NEXT: movq %rax, %rdi
; SSE2-NEXT: movq %rdx, %rsi
-; SSE2-NEXT: addq %rcx, %rsi
-; SSE2-NEXT: setb %cl
-; SSE2-NEXT: orb %bl, %cl
-; SSE2-NEXT: orb %r13b, %cl
+; SSE2-NEXT: addq %rbx, %rsi
+; SSE2-NEXT: setb %r13b
+; SSE2-NEXT: orb %cl, %r13b
+; SSE2-NEXT: orb %bpl, %r13b
; SSE2-NEXT: testq %r9, %r9
; SSE2-NEXT: setne %al
; SSE2-NEXT: testq %r12, %r12
; SSE2-NEXT: setne %r10b
; SSE2-NEXT: andb %al, %r10b
; SSE2-NEXT: movq %r12, %rax
-; SSE2-NEXT: mulq %r15
-; SSE2-NEXT: movq %rax, %rdi
-; SSE2-NEXT: seto %bpl
+; SSE2-NEXT: mulq %r14
+; SSE2-NEXT: movq %rax, %rbp
+; SSE2-NEXT: seto %r8b
; SSE2-NEXT: movq %r9, %rax
; SSE2-NEXT: mulq %r11
-; SSE2-NEXT: movq %rax, %rbx
-; SSE2-NEXT: seto %r9b
-; SSE2-NEXT: orb %bpl, %r9b
-; SSE2-NEXT: addq %rdi, %rbx
+; SSE2-NEXT: seto %cl
+; SSE2-NEXT: orb %r8b, %cl
+; SSE2-NEXT: addq %rax, %rbp
; SSE2-NEXT: movq %r11, %rax
-; SSE2-NEXT: mulq %r15
-; SSE2-NEXT: addq %rbx, %rdx
+; SSE2-NEXT: mulq %r14
+; SSE2-NEXT: addq %rbp, %rdx
; SSE2-NEXT: setb %bl
-; SSE2-NEXT: orb %r9b, %bl
+; SSE2-NEXT: orb %cl, %bl
; SSE2-NEXT: orb %r10b, %bl
-; SSE2-NEXT: movzbl %bl, %edi
-; SSE2-NEXT: negl %edi
-; SSE2-NEXT: movd %edi, %xmm1
-; SSE2-NEXT: movzbl %cl, %ecx
+; SSE2-NEXT: movzbl %bl, %ecx
+; SSE2-NEXT: negl %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movzbl %r13b, %ecx
; SSE2-NEXT: negl %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq %rax, 16(%r14)
-; SSE2-NEXT: movq %r8, (%r14)
-; SSE2-NEXT: movq %rdx, 24(%r14)
-; SSE2-NEXT: movq %rsi, 8(%r14)
+; SSE2-NEXT: movq %rax, 16(%r15)
+; SSE2-NEXT: movq %rdi, (%r15)
+; SSE2-NEXT: movq %rdx, 24(%r15)
+; SSE2-NEXT: movq %rsi, 8(%r15)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r13
@@ -3029,63 +3027,61 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
; SSSE3-NEXT: movq %rcx, %r12
; SSSE3-NEXT: movq %rdx, %r11
; SSSE3-NEXT: movq %rsi, %rax
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r9
; SSSE3-NEXT: testq %r10, %r10
-; SSSE3-NEXT: setne %cl
+; SSSE3-NEXT: setne %dl
; SSSE3-NEXT: testq %rsi, %rsi
-; SSSE3-NEXT: setne %r13b
-; SSSE3-NEXT: andb %cl, %r13b
+; SSSE3-NEXT: setne %bpl
+; SSSE3-NEXT: andb %dl, %bpl
; SSSE3-NEXT: mulq %r8
; SSSE3-NEXT: movq %rax, %rsi
-; SSSE3-NEXT: seto %bpl
+; SSSE3-NEXT: seto %bl
; SSSE3-NEXT: movq %r10, %rax
; SSSE3-NEXT: mulq %rdi
-; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: seto %bl
-; SSSE3-NEXT: orb %bpl, %bl
-; SSSE3-NEXT: addq %rsi, %rcx
+; SSSE3-NEXT: seto %cl
+; SSSE3-NEXT: orb %bl, %cl
+; SSSE3-NEXT: leaq (%rsi,%rax), %rbx
; SSSE3-NEXT: movq %rdi, %rax
; SSSE3-NEXT: mulq %r8
-; SSSE3-NEXT: movq %rax, %r8
+; SSSE3-NEXT: movq %rax, %rdi
; SSSE3-NEXT: movq %rdx, %rsi
-; SSSE3-NEXT: addq %rcx, %rsi
-; SSSE3-NEXT: setb %cl
-; SSSE3-NEXT: orb %bl, %cl
-; SSSE3-NEXT: orb %r13b, %cl
+; SSSE3-NEXT: addq %rbx, %rsi
+; SSSE3-NEXT: setb %r13b
+; SSSE3-NEXT: orb %cl, %r13b
+; SSSE3-NEXT: orb %bpl, %r13b
; SSSE3-NEXT: testq %r9, %r9
; SSSE3-NEXT: setne %al
; SSSE3-NEXT: testq %r12, %r12
; SSSE3-NEXT: setne %r10b
; SSSE3-NEXT: andb %al, %r10b
; SSSE3-NEXT: movq %r12, %rax
-; SSSE3-NEXT: mulq %r15
-; SSSE3-NEXT: movq %rax, %rdi
-; SSSE3-NEXT: seto %bpl
+; SSSE3-NEXT: mulq %r14
+; SSSE3-NEXT: movq %rax, %rbp
+; SSSE3-NEXT: seto %r8b
; SSSE3-NEXT: movq %r9, %rax
; SSSE3-NEXT: mulq %r11
-; SSSE3-NEXT: movq %rax, %rbx
-; SSSE3-NEXT: seto %r9b
-; SSSE3-NEXT: orb %bpl, %r9b
-; SSSE3-NEXT: addq %rdi, %rbx
+; SSSE3-NEXT: seto %cl
+; SSSE3-NEXT: orb %r8b, %cl
+; SSSE3-NEXT: addq %rax, %rbp
; SSSE3-NEXT: movq %r11, %rax
-; SSSE3-NEXT: mulq %r15
-; SSSE3-NEXT: addq %rbx, %rdx
+; SSSE3-NEXT: mulq %r14
+; SSSE3-NEXT: addq %rbp, %rdx
; SSSE3-NEXT: setb %bl
-; SSSE3-NEXT: orb %r9b, %bl
+; SSSE3-NEXT: orb %cl, %bl
; SSSE3-NEXT: orb %r10b, %bl
-; SSSE3-NEXT: movzbl %bl, %edi
-; SSSE3-NEXT: negl %edi
-; SSSE3-NEXT: movd %edi, %xmm1
-; SSSE3-NEXT: movzbl %cl, %ecx
+; SSSE3-NEXT: movzbl %bl, %ecx
+; SSSE3-NEXT: negl %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: movzbl %r13b, %ecx
; SSSE3-NEXT: negl %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movq %rax, 16(%r14)
-; SSSE3-NEXT: movq %r8, (%r14)
-; SSSE3-NEXT: movq %rdx, 24(%r14)
-; SSSE3-NEXT: movq %rsi, 8(%r14)
+; SSSE3-NEXT: movq %rax, 16(%r15)
+; SSSE3-NEXT: movq %rdi, (%r15)
+; SSSE3-NEXT: movq %rdx, 24(%r15)
+; SSSE3-NEXT: movq %rsi, 8(%r15)
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
; SSSE3-NEXT: popq %r13
@@ -3106,62 +3102,60 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
; SSE41-NEXT: movq %rcx, %r12
; SSE41-NEXT: movq %rdx, %r11
; SSE41-NEXT: movq %rsi, %rax
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r9
; SSE41-NEXT: testq %r10, %r10
-; SSE41-NEXT: setne %cl
+; SSE41-NEXT: setne %dl
; SSE41-NEXT: testq %rsi, %rsi
-; SSE41-NEXT: setne %r13b
-; SSE41-NEXT: andb %cl, %r13b
+; SSE41-NEXT: setne %bpl
+; SSE41-NEXT: andb %dl, %bpl
; SSE41-NEXT: mulq %r8
; SSE41-NEXT: movq %rax, %rsi
-; SSE41-NEXT: seto %bpl
+; SSE41-NEXT: seto %bl
; SSE41-NEXT: movq %r10, %rax
; SSE41-NEXT: mulq %rdi
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: seto %bl
-; SSE41-NEXT: orb %bpl, %bl
-; SSE41-NEXT: addq %rsi, %rcx
+; SSE41-NEXT: seto %cl
+; SSE41-NEXT: orb %bl, %cl
+; SSE41-NEXT: leaq (%rsi,%rax), %rbx
; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: mulq %r8
-; SSE41-NEXT: movq %rax, %r8
+; SSE41-NEXT: movq %rax, %rdi
; SSE41-NEXT: movq %rdx, %rsi
-; SSE41-NEXT: addq %rcx, %rsi
-; SSE41-NEXT: setb %cl
-; SSE41-NEXT: orb %bl, %cl
-; SSE41-NEXT: orb %r13b, %cl
+; SSE41-NEXT: addq %rbx, %rsi
+; SSE41-NEXT: setb %r13b
+; SSE41-NEXT: orb %cl, %r13b
+; SSE41-NEXT: orb %bpl, %r13b
; SSE41-NEXT: testq %r9, %r9
; SSE41-NEXT: setne %al
; SSE41-NEXT: testq %r12, %r12
; SSE41-NEXT: setne %r10b
; SSE41-NEXT: andb %al, %r10b
; SSE41-NEXT: movq %r12, %rax
-; SSE41-NEXT: mulq %r15
-; SSE41-NEXT: movq %rax, %rdi
-; SSE41-NEXT: seto %bpl
+; SSE41-NEXT: mulq %r14
+; SSE41-NEXT: movq %rax, %rbp
+; SSE41-NEXT: seto %r8b
; SSE41-NEXT: movq %r9, %rax
; SSE41-NEXT: mulq %r11
-; SSE41-NEXT: movq %rax, %rbx
-; SSE41-NEXT: seto %r9b
-; SSE41-NEXT: orb %bpl, %r9b
-; SSE41-NEXT: addq %rdi, %rbx
+; SSE41-NEXT: seto %cl
+; SSE41-NEXT: orb %r8b, %cl
+; SSE41-NEXT: addq %rax, %rbp
; SSE41-NEXT: movq %r11, %rax
-; SSE41-NEXT: mulq %r15
-; SSE41-NEXT: addq %rbx, %rdx
+; SSE41-NEXT: mulq %r14
+; SSE41-NEXT: addq %rbp, %rdx
; SSE41-NEXT: setb %bl
-; SSE41-NEXT: orb %r9b, %bl
+; SSE41-NEXT: orb %cl, %bl
; SSE41-NEXT: orb %r10b, %bl
-; SSE41-NEXT: movzbl %bl, %edi
-; SSE41-NEXT: negl %edi
-; SSE41-NEXT: movzbl %cl, %ecx
+; SSE41-NEXT: movzbl %bl, %ecx
; SSE41-NEXT: negl %ecx
-; SSE41-NEXT: movd %ecx, %xmm0
-; SSE41-NEXT: pinsrd $1, %edi, %xmm0
-; SSE41-NEXT: movq %rax, 16(%r14)
-; SSE41-NEXT: movq %r8, (%r14)
-; SSE41-NEXT: movq %rdx, 24(%r14)
-; SSE41-NEXT: movq %rsi, 8(%r14)
+; SSE41-NEXT: movzbl %r13b, %ebp
+; SSE41-NEXT: negl %ebp
+; SSE41-NEXT: movd %ebp, %xmm0
+; SSE41-NEXT: pinsrd $1, %ecx, %xmm0
+; SSE41-NEXT: movq %rax, 16(%r15)
+; SSE41-NEXT: movq %rdi, (%r15)
+; SSE41-NEXT: movq %rdx, 24(%r15)
+; SSE41-NEXT: movq %rsi, 8(%r15)
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
; SSE41-NEXT: popq %r13
@@ -3182,62 +3176,60 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
; AVX-NEXT: movq %rcx, %r12
; AVX-NEXT: movq %rdx, %r11
; AVX-NEXT: movq %rsi, %rax
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r14
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r14
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r9
; AVX-NEXT: testq %r10, %r10
-; AVX-NEXT: setne %cl
+; AVX-NEXT: setne %dl
; AVX-NEXT: testq %rsi, %rsi
-; AVX-NEXT: setne %r13b
-; AVX-NEXT: andb %cl, %r13b
+; AVX-NEXT: setne %bpl
+; AVX-NEXT: andb %dl, %bpl
; AVX-NEXT: mulq %r8
; AVX-NEXT: movq %rax, %rsi
-; AVX-NEXT: seto %bpl
+; AVX-NEXT: seto %bl
; AVX-NEXT: movq %r10, %rax
; AVX-NEXT: mulq %rdi
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: seto %bl
-; AVX-NEXT: orb %bpl, %bl
-; AVX-NEXT: addq %rsi, %rcx
+; AVX-NEXT: seto %cl
+; AVX-NEXT: orb %bl, %cl
+; AVX-NEXT: leaq (%rsi,%rax), %rbx
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: mulq %r8
-; AVX-NEXT: movq %rax, %r8
+; AVX-NEXT: movq %rax, %rdi
; AVX-NEXT: movq %rdx, %rsi
-; AVX-NEXT: addq %rcx, %rsi
-; AVX-NEXT: setb %cl
-; AVX-NEXT: orb %bl, %cl
-; AVX-NEXT: orb %r13b, %cl
+; AVX-NEXT: addq %rbx, %rsi
+; AVX-NEXT: setb %r13b
+; AVX-NEXT: orb %cl, %r13b
+; AVX-NEXT: orb %bpl, %r13b
; AVX-NEXT: testq %r9, %r9
; AVX-NEXT: setne %al
; AVX-NEXT: testq %r12, %r12
; AVX-NEXT: setne %r10b
; AVX-NEXT: andb %al, %r10b
; AVX-NEXT: movq %r12, %rax
-; AVX-NEXT: mulq %r15
-; AVX-NEXT: movq %rax, %rdi
-; AVX-NEXT: seto %bpl
+; AVX-NEXT: mulq %r14
+; AVX-NEXT: movq %rax, %rbp
+; AVX-NEXT: seto %r8b
; AVX-NEXT: movq %r9, %rax
; AVX-NEXT: mulq %r11
-; AVX-NEXT: movq %rax, %rbx
-; AVX-NEXT: seto %r9b
-; AVX-NEXT: orb %bpl, %r9b
-; AVX-NEXT: addq %rdi, %rbx
+; AVX-NEXT: seto %cl
+; AVX-NEXT: orb %r8b, %cl
+; AVX-NEXT: addq %rax, %rbp
; AVX-NEXT: movq %r11, %rax
-; AVX-NEXT: mulq %r15
-; AVX-NEXT: addq %rbx, %rdx
+; AVX-NEXT: mulq %r14
+; AVX-NEXT: addq %rbp, %rdx
; AVX-NEXT: setb %bl
-; AVX-NEXT: orb %r9b, %bl
+; AVX-NEXT: orb %cl, %bl
; AVX-NEXT: orb %r10b, %bl
-; AVX-NEXT: movzbl %bl, %edi
-; AVX-NEXT: negl %edi
-; AVX-NEXT: movzbl %cl, %ecx
+; AVX-NEXT: movzbl %bl, %ecx
; AVX-NEXT: negl %ecx
-; AVX-NEXT: vmovd %ecx, %xmm0
-; AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, 16(%r14)
-; AVX-NEXT: movq %r8, (%r14)
-; AVX-NEXT: movq %rdx, 24(%r14)
-; AVX-NEXT: movq %rsi, 8(%r14)
+; AVX-NEXT: movzbl %r13b, %ebp
+; AVX-NEXT: negl %ebp
+; AVX-NEXT: vmovd %ebp, %xmm0
+; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX-NEXT: movq %rax, 16(%r15)
+; AVX-NEXT: movq %rdi, (%r15)
+; AVX-NEXT: movq %rdx, 24(%r15)
+; AVX-NEXT: movq %rsi, 8(%r15)
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r12
; AVX-NEXT: popq %r13
@@ -3251,7 +3243,6 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: pushq %r15
; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r13
; AVX512F-NEXT: pushq %r12
; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: movq %rcx, %rax
@@ -3263,25 +3254,24 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
; AVX512F-NEXT: testq %r10, %r10
; AVX512F-NEXT: setne %dl
; AVX512F-NEXT: testq %rcx, %rcx
-; AVX512F-NEXT: setne %r13b
-; AVX512F-NEXT: andb %dl, %r13b
+; AVX512F-NEXT: setne %bl
+; AVX512F-NEXT: andb %dl, %bl
; AVX512F-NEXT: mulq %r15
; AVX512F-NEXT: movq %rax, %rdi
; AVX512F-NEXT: seto %bpl
; AVX512F-NEXT: movq %r10, %rax
; AVX512F-NEXT: mulq %r12
-; AVX512F-NEXT: movq %rax, %rbx
; AVX512F-NEXT: seto %cl
; AVX512F-NEXT: orb %bpl, %cl
-; AVX512F-NEXT: addq %rdi, %rbx
+; AVX512F-NEXT: leaq (%rdi,%rax), %rbp
; AVX512F-NEXT: movq %r12, %rax
; AVX512F-NEXT: mulq %r15
; AVX512F-NEXT: movq %rax, %r10
-; AVX512F-NEXT: movq %rdx, %r15
-; AVX512F-NEXT: addq %rbx, %r15
+; AVX512F-NEXT: movq %rdx, %rdi
+; AVX512F-NEXT: addq %rbp, %rdi
; AVX512F-NEXT: setb %al
; AVX512F-NEXT: orb %cl, %al
-; AVX512F-NEXT: orb %r13b, %al
+; AVX512F-NEXT: orb %bl, %al
; AVX512F-NEXT: kmovw %eax, %k0
; AVX512F-NEXT: testq %r9, %r9
; AVX512F-NEXT: setne %al
@@ -3294,13 +3284,12 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
; AVX512F-NEXT: seto %bpl
; AVX512F-NEXT: movq %r9, %rax
; AVX512F-NEXT: mulq %r11
-; AVX512F-NEXT: movq %rax, %rdi
; AVX512F-NEXT: seto %bl
; AVX512F-NEXT: orb %bpl, %bl
-; AVX512F-NEXT: addq %rsi, %rdi
+; AVX512F-NEXT: addq %rax, %rsi
; AVX512F-NEXT: movq %r11, %rax
; AVX512F-NEXT: mulq %r8
-; AVX512F-NEXT: addq %rdi, %rdx
+; AVX512F-NEXT: addq %rsi, %rdx
; AVX512F-NEXT: setb %sil
; AVX512F-NEXT: orb %bl, %sil
; AVX512F-NEXT: orb %cl, %sil
@@ -3312,11 +3301,10 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512F-NEXT: movq %r10, 16(%r14)
; AVX512F-NEXT: movq %rax, (%r14)
-; AVX512F-NEXT: movq %r15, 24(%r14)
+; AVX512F-NEXT: movq %rdi, 24(%r14)
; AVX512F-NEXT: movq %rdx, 8(%r14)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r13
; AVX512F-NEXT: popq %r14
; AVX512F-NEXT: popq %r15
; AVX512F-NEXT: popq %rbp
@@ -3327,7 +3315,6 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
; AVX512BW-NEXT: pushq %rbp
; AVX512BW-NEXT: pushq %r15
; AVX512BW-NEXT: pushq %r14
-; AVX512BW-NEXT: pushq %r13
; AVX512BW-NEXT: pushq %r12
; AVX512BW-NEXT: pushq %rbx
; AVX512BW-NEXT: movq %rcx, %rax
@@ -3339,25 +3326,24 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
; AVX512BW-NEXT: testq %r10, %r10
; AVX512BW-NEXT: setne %dl
; AVX512BW-NEXT: testq %rcx, %rcx
-; AVX512BW-NEXT: setne %r13b
-; AVX512BW-NEXT: andb %dl, %r13b
+; AVX512BW-NEXT: setne %bl
+; AVX512BW-NEXT: andb %dl, %bl
; AVX512BW-NEXT: mulq %r15
; AVX512BW-NEXT: movq %rax, %rdi
; AVX512BW-NEXT: seto %bpl
; AVX512BW-NEXT: movq %r10, %rax
; AVX512BW-NEXT: mulq %r12
-; AVX512BW-NEXT: movq %rax, %rbx
; AVX512BW-NEXT: seto %cl
; AVX512BW-NEXT: orb %bpl, %cl
-; AVX512BW-NEXT: addq %rdi, %rbx
+; AVX512BW-NEXT: leaq (%rdi,%rax), %rbp
; AVX512BW-NEXT: movq %r12, %rax
; AVX512BW-NEXT: mulq %r15
; AVX512BW-NEXT: movq %rax, %r10
-; AVX512BW-NEXT: movq %rdx, %r15
-; AVX512BW-NEXT: addq %rbx, %r15
+; AVX512BW-NEXT: movq %rdx, %rdi
+; AVX512BW-NEXT: addq %rbp, %rdi
; AVX512BW-NEXT: setb %al
; AVX512BW-NEXT: orb %cl, %al
-; AVX512BW-NEXT: orb %r13b, %al
+; AVX512BW-NEXT: orb %bl, %al
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: testq %r9, %r9
; AVX512BW-NEXT: setne %al
@@ -3370,13 +3356,12 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
; AVX512BW-NEXT: seto %bpl
; AVX512BW-NEXT: movq %r9, %rax
; AVX512BW-NEXT: mulq %r11
-; AVX512BW-NEXT: movq %rax, %rdi
; AVX512BW-NEXT: seto %bl
; AVX512BW-NEXT: orb %bpl, %bl
-; AVX512BW-NEXT: addq %rsi, %rdi
+; AVX512BW-NEXT: addq %rax, %rsi
; AVX512BW-NEXT: movq %r11, %rax
; AVX512BW-NEXT: mulq %r8
-; AVX512BW-NEXT: addq %rdi, %rdx
+; AVX512BW-NEXT: addq %rsi, %rdx
; AVX512BW-NEXT: setb %sil
; AVX512BW-NEXT: orb %bl, %sil
; AVX512BW-NEXT: orb %cl, %sil
@@ -3388,11 +3373,10 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512BW-NEXT: movq %r10, 16(%r14)
; AVX512BW-NEXT: movq %rax, (%r14)
-; AVX512BW-NEXT: movq %r15, 24(%r14)
+; AVX512BW-NEXT: movq %rdi, 24(%r14)
; AVX512BW-NEXT: movq %rdx, 8(%r14)
; AVX512BW-NEXT: popq %rbx
; AVX512BW-NEXT: popq %r12
-; AVX512BW-NEXT: popq %r13
; AVX512BW-NEXT: popq %r14
; AVX512BW-NEXT: popq %r15
; AVX512BW-NEXT: popq %rbp
diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index d416b1a547815..71d92af0dd94b 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -487,10 +487,9 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, i64* %res) {
; WIN32-NEXT: movl %esi, %eax
; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %ecx, %edx
-; WIN32-NEXT: movl %eax, %esi
; WIN32-NEXT: seto %ch
; WIN32-NEXT: orb %bh, %ch
-; WIN32-NEXT: addl %edi, %esi
+; WIN32-NEXT: leal (%edi,%eax), %esi
; WIN32-NEXT: movl %edx, %eax
; WIN32-NEXT: mull %ebp
; WIN32-NEXT: addl %esi, %edx
@@ -713,6 +712,7 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) {
; WIN32-NEXT: pushl %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
; WIN32-NEXT: testl %ebp, %ebp
; WIN32-NEXT: setne %al
@@ -720,26 +720,26 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) {
; WIN32-NEXT: setne %bl
; WIN32-NEXT: andb %al, %bl
; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: mull {{[0-9]+}}(%esp)
+; WIN32-NEXT: mull %edi
+; WIN32-NEXT: movl %edi, %edx
; WIN32-NEXT: movl %eax, %edi
; WIN32-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: movl %edx, %ebp
; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %eax, %ebp
; WIN32-NEXT: seto %bh
; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; WIN32-NEXT: addl %edi, %ebp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT: addl %eax, %edi
; WIN32-NEXT: movl %ecx, %eax
-; WIN32-NEXT: mull %edi
-; WIN32-NEXT: addl %ebp, %edx
+; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: addl %edi, %edx
; WIN32-NEXT: setb %al
; WIN32-NEXT: orb %bh, %al
; WIN32-NEXT: orb %bl, %al
; WIN32-NEXT: testb %al, %al
; WIN32-NEXT: jne LBB14_2
; WIN32-NEXT: # %bb.1:
-; WIN32-NEXT: movl %edi, %ecx
+; WIN32-NEXT: movl %ebp, %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
; WIN32-NEXT: LBB14_2:
; WIN32-NEXT: movl %ecx, %eax
@@ -1337,10 +1337,9 @@ define zeroext i1 @umulobri64(i64 %v1, i64 %v2) {
; WIN32-NEXT: movl %esi, %eax
; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %ecx, %edx
-; WIN32-NEXT: movl %eax, %esi
; WIN32-NEXT: seto %cl
; WIN32-NEXT: orb %bh, %cl
-; WIN32-NEXT: addl %edi, %esi
+; WIN32-NEXT: leal (%edi,%eax), %esi
; WIN32-NEXT: movl %edx, %eax
; WIN32-NEXT: mull %ebp
; WIN32-NEXT: addl %esi, %edx
@@ -2244,10 +2243,9 @@ define zeroext i1 @umuloi64_load(i64* %ptr1, i64 %v2, i64* %res) {
; WIN32-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; WIN32-NEXT: movl %esi, %eax
; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %eax, %esi
; WIN32-NEXT: seto %bh
; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; WIN32-NEXT: addl %edi, %esi
+; WIN32-NEXT: leal (%edi,%eax), %esi
; WIN32-NEXT: movl %ecx, %eax
; WIN32-NEXT: mull %ebp
; WIN32-NEXT: addl %esi, %edx
@@ -2325,10 +2323,9 @@ define zeroext i1 @umuloi64_load2(i64 %v1, i64* %ptr2, i64* %res) {
; WIN32-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; WIN32-NEXT: movl %esi, %eax
; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: movl %eax, %esi
; WIN32-NEXT: seto %bh
; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; WIN32-NEXT: addl %edi, %esi
+; WIN32-NEXT: leal (%edi,%eax), %esi
; WIN32-NEXT: movl %ebp, %eax
; WIN32-NEXT: mull %ecx
; WIN32-NEXT: addl %esi, %edx