[llvm] 3f22a49 - [X86] selectLEAAddr - add X86ISD::SMUL/UMULO handling

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 17 05:51:25 PST 2022


Author: Simon Pilgrim
Date: 2022-02-17T13:51:02Z
New Revision: 3f22a4962dafe2718a92b3cd9b5be4a6fcc83b77

URL: https://github.com/llvm/llvm-project/commit/3f22a4962dafe2718a92b3cd9b5be4a6fcc83b77
DIFF: https://github.com/llvm/llvm-project/commit/3f22a4962dafe2718a92b3cd9b5be4a6fcc83b77.diff

LOG: [X86] selectLEAAddr - add X86ISD::SMUL/UMULO handling

After D118128 relaxed the heuristic to require only one EFLAGS-generating operand, it now makes sense to avoid X86ISD::SMUL/UMULO duplication as well.

Differential Revision: https://reviews.llvm.org/D119578
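
For reference, the pattern this change improves is the one exercised by the smul_add_imm test in select-lea.ll. A sketch of that IR is below (reconstructed from the visible test body and its CHECK lines, so the exact value names are illustrative). The multiply result feeds both the overflow bit and the add-of-immediate; previously folding the add into an LEA meant duplicating the EFLAGS-producing multiply, whereas a single imull followed by leal/cmovnol now suffices:

    define i32 @smul_add_imm(i32 %x, i32 %y) {
      %o = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %x, i32 %y)
      %v1 = extractvalue { i32, i1 } %o, 0   ; the product, also usable as an LEA base
      %ov = extractvalue { i32, i1 } %o, 1   ; the overflow bit (EFLAGS consumer)
      %a = add i32 %v1, 100                  ; folds into leal 100(%rdi), %eax
      %s = select i1 %ov, i32 %a, i32 %v1    ; becomes cmovnol
      ret i32 %s
    }

    declare { i32, i1 } @llvm.smul.with.overflow.i32(i32, i32)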

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
    llvm/test/CodeGen/X86/select-lea.ll
    llvm/test/CodeGen/X86/umul_fix_sat.ll
    llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
    llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
    llvm/test/CodeGen/X86/vec_umulo.ll
    llvm/test/CodeGen/X86/xmulo.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 17f4b1ec5bf78..66c44a49f4f68 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2782,10 +2782,10 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
       case X86ISD::SUB:
       case X86ISD::ADC:
       case X86ISD::SBB:
-      /* TODO: These opcodes can be added safely, but we may want to justify
-               their inclusion for different reasons (better for reg-alloc).
       case X86ISD::SMUL:
       case X86ISD::UMUL:
+      /* TODO: These opcodes can be added safely, but we may want to justify
+               their inclusion for different reasons (better for reg-alloc).
       case X86ISD::OR:
       case X86ISD::XOR:
       case X86ISD::AND:

diff --git a/llvm/test/CodeGen/X86/select-lea.ll b/llvm/test/CodeGen/X86/select-lea.ll
index 487b1f3d3a223..a849280c1377e 100644
--- a/llvm/test/CodeGen/X86/select-lea.ll
+++ b/llvm/test/CodeGen/X86/select-lea.ll
@@ -330,35 +330,27 @@ define i32 @usub_add_load(i32 %x, i32 %y, i32* %pz) nounwind {
 define i32 @smul_add_imm(i32 %x, i32 %y) {
 ; X64-LABEL: smul_add_imm:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    imull %esi, %eax
-; X64-NEXT:    addl $100, %eax
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    imull %esi, %edi
+; X64-NEXT:    leal 100(%rdi), %eax
 ; X64-NEXT:    cmovnol %edi, %eax
 ; X64-NEXT:    retq
 ;
 ; CMOV-LABEL: smul_add_imm:
 ; CMOV:       # %bb.0:
-; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CMOV-NEXT:    movl %eax, %edx
-; CMOV-NEXT:    imull %ecx, %edx
-; CMOV-NEXT:    addl $100, %edx
-; CMOV-NEXT:    imull %ecx, %eax
-; CMOV-NEXT:    cmovol %edx, %eax
+; CMOV-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; CMOV-NEXT:    leal 100(%ecx), %eax
+; CMOV-NEXT:    cmovnol %ecx, %eax
 ; CMOV-NEXT:    retl
 ;
 ; NOCMOV-LABEL: smul_add_imm:
 ; NOCMOV:       # %bb.0:
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; NOCMOV-NEXT:    movl %eax, %ecx
-; NOCMOV-NEXT:    imull %edx, %ecx
-; NOCMOV-NEXT:    imull %edx, %eax
+; NOCMOV-NEXT:    imull {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    jno .LBB8_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    addl $100, %ecx
-; NOCMOV-NEXT:    movl %ecx, %eax
+; NOCMOV-NEXT:    addl $100, %eax
 ; NOCMOV-NEXT:  .LBB8_2:
 ; NOCMOV-NEXT:    retl
   %o = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %x, i32 %y)
@@ -422,10 +414,8 @@ define i32 @umul_add_imm(i32 %x, i32 %y) {
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    mull %esi
 ; X64-NEXT:    # kill: def $eax killed $eax def $rax
-; X64-NEXT:    seto %cl
-; X64-NEXT:    leal 100(%rax), %edx
-; X64-NEXT:    testb %cl, %cl
-; X64-NEXT:    cmovnel %edx, %eax
+; X64-NEXT:    leal 100(%rax), %ecx
+; X64-NEXT:    cmovol %ecx, %eax
 ; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
 ;
@@ -433,10 +423,8 @@ define i32 @umul_add_imm(i32 %x, i32 %y) {
 ; CMOV:       # %bb.0:
 ; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CMOV-NEXT:    mull {{[0-9]+}}(%esp)
-; CMOV-NEXT:    seto %cl
-; CMOV-NEXT:    leal 100(%eax), %edx
-; CMOV-NEXT:    testb %cl, %cl
-; CMOV-NEXT:    cmovnel %edx, %eax
+; CMOV-NEXT:    leal 100(%eax), %ecx
+; CMOV-NEXT:    cmovol %ecx, %eax
 ; CMOV-NEXT:    retl
 ;
 ; NOCMOV-LABEL: umul_add_imm:

diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll
index 504557242c305..247b5ee17e7a5 100644
--- a/llvm/test/CodeGen/X86/umul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll
@@ -281,21 +281,21 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    setne %bl
 ; X86-NEXT:    andb %dl, %bl
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    seto %bh
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    seto %cl
 ; X86-NEXT:    orb %bh, %cl
-; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    leal (%edi,%eax), %esi
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %esi, %edx

diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
index 40fc6db7fe6b2..3d7544f7f6814 100644
--- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
@@ -19,10 +19,10 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    seto %r11b
 ; X64-NEXT:    orb %r10b, %r11b
-; X64-NEXT:    addq %rax, %rsi
+; X64-NEXT:    leaq (%rsi,%rax), %rcx
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r8
-; X64-NEXT:    addq %rsi, %rdx
+; X64-NEXT:    addq %rcx, %rdx
 ; X64-NEXT:    setb %cl
 ; X64-NEXT:    orb %r11b, %cl
 ; X64-NEXT:    orb %r9b, %cl
@@ -38,64 +38,63 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $28, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 48
+; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 44
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    leal (%ecx,%eax), %ecx
+; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    leal (%ecx,%eax), %ecx
 ; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebp, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    adcl %ebp, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    addl %esi, %ecx
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -103,12 +102,12 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    adcl %edi, %edx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    testl %ebp, %ebp
 ; X86-NEXT:    setne %cl
@@ -121,10 +120,10 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
 ; X86-NEXT:    orb %ch, %cl
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setne %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    testl %ebp, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    setne %bh
 ; X86-NEXT:    andb %cl, %bh
 ; X86-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
@@ -133,7 +132,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    setne %bl
-; X86-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, 4(%ecx)
@@ -150,7 +149,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    andb $1, %al
 ; X86-NEXT:    movb %al, 16(%ecx)
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 20
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16

diff --git a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
index 2b7e032fb4b7f..3bbeec17c7a9e 100644
--- a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
@@ -19,21 +19,21 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    setne %bl
 ; X86-NEXT:    andb %dl, %bl
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    seto %bh
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    seto %ch
 ; X86-NEXT:    orb %bh, %ch
-; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    leal (%edi,%eax), %esi
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %esi, %edx

diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index 51de68916596b..bd448d5d19244 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -2952,63 +2952,61 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; SSE2-NEXT:    movq %rcx, %r12
 ; SSE2-NEXT:    movq %rdx, %r11
 ; SSE2-NEXT:    movq %rsi, %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r9
 ; SSE2-NEXT:    testq %r10, %r10
-; SSE2-NEXT:    setne %cl
+; SSE2-NEXT:    setne %dl
 ; SSE2-NEXT:    testq %rsi, %rsi
-; SSE2-NEXT:    setne %r13b
-; SSE2-NEXT:    andb %cl, %r13b
+; SSE2-NEXT:    setne %bpl
+; SSE2-NEXT:    andb %dl, %bpl
 ; SSE2-NEXT:    mulq %r8
 ; SSE2-NEXT:    movq %rax, %rsi
-; SSE2-NEXT:    seto %bpl
+; SSE2-NEXT:    seto %bl
 ; SSE2-NEXT:    movq %r10, %rax
 ; SSE2-NEXT:    mulq %rdi
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    seto %bl
-; SSE2-NEXT:    orb %bpl, %bl
-; SSE2-NEXT:    addq %rsi, %rcx
+; SSE2-NEXT:    seto %cl
+; SSE2-NEXT:    orb %bl, %cl
+; SSE2-NEXT:    leaq (%rsi,%rax), %rbx
 ; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    mulq %r8
-; SSE2-NEXT:    movq %rax, %r8
+; SSE2-NEXT:    movq %rax, %rdi
 ; SSE2-NEXT:    movq %rdx, %rsi
-; SSE2-NEXT:    addq %rcx, %rsi
-; SSE2-NEXT:    setb %cl
-; SSE2-NEXT:    orb %bl, %cl
-; SSE2-NEXT:    orb %r13b, %cl
+; SSE2-NEXT:    addq %rbx, %rsi
+; SSE2-NEXT:    setb %r13b
+; SSE2-NEXT:    orb %cl, %r13b
+; SSE2-NEXT:    orb %bpl, %r13b
 ; SSE2-NEXT:    testq %r9, %r9
 ; SSE2-NEXT:    setne %al
 ; SSE2-NEXT:    testq %r12, %r12
 ; SSE2-NEXT:    setne %r10b
 ; SSE2-NEXT:    andb %al, %r10b
 ; SSE2-NEXT:    movq %r12, %rax
-; SSE2-NEXT:    mulq %r15
-; SSE2-NEXT:    movq %rax, %rdi
-; SSE2-NEXT:    seto %bpl
+; SSE2-NEXT:    mulq %r14
+; SSE2-NEXT:    movq %rax, %rbp
+; SSE2-NEXT:    seto %r8b
 ; SSE2-NEXT:    movq %r9, %rax
 ; SSE2-NEXT:    mulq %r11
-; SSE2-NEXT:    movq %rax, %rbx
-; SSE2-NEXT:    seto %r9b
-; SSE2-NEXT:    orb %bpl, %r9b
-; SSE2-NEXT:    addq %rdi, %rbx
+; SSE2-NEXT:    seto %cl
+; SSE2-NEXT:    orb %r8b, %cl
+; SSE2-NEXT:    addq %rax, %rbp
 ; SSE2-NEXT:    movq %r11, %rax
-; SSE2-NEXT:    mulq %r15
-; SSE2-NEXT:    addq %rbx, %rdx
+; SSE2-NEXT:    mulq %r14
+; SSE2-NEXT:    addq %rbp, %rdx
 ; SSE2-NEXT:    setb %bl
-; SSE2-NEXT:    orb %r9b, %bl
+; SSE2-NEXT:    orb %cl, %bl
 ; SSE2-NEXT:    orb %r10b, %bl
-; SSE2-NEXT:    movzbl %bl, %edi
-; SSE2-NEXT:    negl %edi
-; SSE2-NEXT:    movd %edi, %xmm1
-; SSE2-NEXT:    movzbl %cl, %ecx
+; SSE2-NEXT:    movzbl %bl, %ecx
+; SSE2-NEXT:    negl %ecx
+; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    movzbl %r13b, %ecx
 ; SSE2-NEXT:    negl %ecx
 ; SSE2-NEXT:    movd %ecx, %xmm0
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    movq %rax, 16(%r14)
-; SSE2-NEXT:    movq %r8, (%r14)
-; SSE2-NEXT:    movq %rdx, 24(%r14)
-; SSE2-NEXT:    movq %rsi, 8(%r14)
+; SSE2-NEXT:    movq %rax, 16(%r15)
+; SSE2-NEXT:    movq %rdi, (%r15)
+; SSE2-NEXT:    movq %rdx, 24(%r15)
+; SSE2-NEXT:    movq %rsi, 8(%r15)
 ; SSE2-NEXT:    popq %rbx
 ; SSE2-NEXT:    popq %r12
 ; SSE2-NEXT:    popq %r13
@@ -3029,63 +3027,61 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; SSSE3-NEXT:    movq %rcx, %r12
 ; SSSE3-NEXT:    movq %rdx, %r11
 ; SSSE3-NEXT:    movq %rsi, %rax
-; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r9
 ; SSSE3-NEXT:    testq %r10, %r10
-; SSSE3-NEXT:    setne %cl
+; SSSE3-NEXT:    setne %dl
 ; SSSE3-NEXT:    testq %rsi, %rsi
-; SSSE3-NEXT:    setne %r13b
-; SSSE3-NEXT:    andb %cl, %r13b
+; SSSE3-NEXT:    setne %bpl
+; SSSE3-NEXT:    andb %dl, %bpl
 ; SSSE3-NEXT:    mulq %r8
 ; SSSE3-NEXT:    movq %rax, %rsi
-; SSSE3-NEXT:    seto %bpl
+; SSSE3-NEXT:    seto %bl
 ; SSSE3-NEXT:    movq %r10, %rax
 ; SSSE3-NEXT:    mulq %rdi
-; SSSE3-NEXT:    movq %rax, %rcx
-; SSSE3-NEXT:    seto %bl
-; SSSE3-NEXT:    orb %bpl, %bl
-; SSSE3-NEXT:    addq %rsi, %rcx
+; SSSE3-NEXT:    seto %cl
+; SSSE3-NEXT:    orb %bl, %cl
+; SSSE3-NEXT:    leaq (%rsi,%rax), %rbx
 ; SSSE3-NEXT:    movq %rdi, %rax
 ; SSSE3-NEXT:    mulq %r8
-; SSSE3-NEXT:    movq %rax, %r8
+; SSSE3-NEXT:    movq %rax, %rdi
 ; SSSE3-NEXT:    movq %rdx, %rsi
-; SSSE3-NEXT:    addq %rcx, %rsi
-; SSSE3-NEXT:    setb %cl
-; SSSE3-NEXT:    orb %bl, %cl
-; SSSE3-NEXT:    orb %r13b, %cl
+; SSSE3-NEXT:    addq %rbx, %rsi
+; SSSE3-NEXT:    setb %r13b
+; SSSE3-NEXT:    orb %cl, %r13b
+; SSSE3-NEXT:    orb %bpl, %r13b
 ; SSSE3-NEXT:    testq %r9, %r9
 ; SSSE3-NEXT:    setne %al
 ; SSSE3-NEXT:    testq %r12, %r12
 ; SSSE3-NEXT:    setne %r10b
 ; SSSE3-NEXT:    andb %al, %r10b
 ; SSSE3-NEXT:    movq %r12, %rax
-; SSSE3-NEXT:    mulq %r15
-; SSSE3-NEXT:    movq %rax, %rdi
-; SSSE3-NEXT:    seto %bpl
+; SSSE3-NEXT:    mulq %r14
+; SSSE3-NEXT:    movq %rax, %rbp
+; SSSE3-NEXT:    seto %r8b
 ; SSSE3-NEXT:    movq %r9, %rax
 ; SSSE3-NEXT:    mulq %r11
-; SSSE3-NEXT:    movq %rax, %rbx
-; SSSE3-NEXT:    seto %r9b
-; SSSE3-NEXT:    orb %bpl, %r9b
-; SSSE3-NEXT:    addq %rdi, %rbx
+; SSSE3-NEXT:    seto %cl
+; SSSE3-NEXT:    orb %r8b, %cl
+; SSSE3-NEXT:    addq %rax, %rbp
 ; SSSE3-NEXT:    movq %r11, %rax
-; SSSE3-NEXT:    mulq %r15
-; SSSE3-NEXT:    addq %rbx, %rdx
+; SSSE3-NEXT:    mulq %r14
+; SSSE3-NEXT:    addq %rbp, %rdx
 ; SSSE3-NEXT:    setb %bl
-; SSSE3-NEXT:    orb %r9b, %bl
+; SSSE3-NEXT:    orb %cl, %bl
 ; SSSE3-NEXT:    orb %r10b, %bl
-; SSSE3-NEXT:    movzbl %bl, %edi
-; SSSE3-NEXT:    negl %edi
-; SSSE3-NEXT:    movd %edi, %xmm1
-; SSSE3-NEXT:    movzbl %cl, %ecx
+; SSSE3-NEXT:    movzbl %bl, %ecx
+; SSSE3-NEXT:    negl %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm1
+; SSSE3-NEXT:    movzbl %r13b, %ecx
 ; SSSE3-NEXT:    negl %ecx
 ; SSSE3-NEXT:    movd %ecx, %xmm0
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT:    movq %rax, 16(%r14)
-; SSSE3-NEXT:    movq %r8, (%r14)
-; SSSE3-NEXT:    movq %rdx, 24(%r14)
-; SSSE3-NEXT:    movq %rsi, 8(%r14)
+; SSSE3-NEXT:    movq %rax, 16(%r15)
+; SSSE3-NEXT:    movq %rdi, (%r15)
+; SSSE3-NEXT:    movq %rdx, 24(%r15)
+; SSSE3-NEXT:    movq %rsi, 8(%r15)
 ; SSSE3-NEXT:    popq %rbx
 ; SSSE3-NEXT:    popq %r12
 ; SSSE3-NEXT:    popq %r13
@@ -3106,62 +3102,60 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; SSE41-NEXT:    movq %rcx, %r12
 ; SSE41-NEXT:    movq %rdx, %r11
 ; SSE41-NEXT:    movq %rsi, %rax
-; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r9
 ; SSE41-NEXT:    testq %r10, %r10
-; SSE41-NEXT:    setne %cl
+; SSE41-NEXT:    setne %dl
 ; SSE41-NEXT:    testq %rsi, %rsi
-; SSE41-NEXT:    setne %r13b
-; SSE41-NEXT:    andb %cl, %r13b
+; SSE41-NEXT:    setne %bpl
+; SSE41-NEXT:    andb %dl, %bpl
 ; SSE41-NEXT:    mulq %r8
 ; SSE41-NEXT:    movq %rax, %rsi
-; SSE41-NEXT:    seto %bpl
+; SSE41-NEXT:    seto %bl
 ; SSE41-NEXT:    movq %r10, %rax
 ; SSE41-NEXT:    mulq %rdi
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    seto %bl
-; SSE41-NEXT:    orb %bpl, %bl
-; SSE41-NEXT:    addq %rsi, %rcx
+; SSE41-NEXT:    seto %cl
+; SSE41-NEXT:    orb %bl, %cl
+; SSE41-NEXT:    leaq (%rsi,%rax), %rbx
 ; SSE41-NEXT:    movq %rdi, %rax
 ; SSE41-NEXT:    mulq %r8
-; SSE41-NEXT:    movq %rax, %r8
+; SSE41-NEXT:    movq %rax, %rdi
 ; SSE41-NEXT:    movq %rdx, %rsi
-; SSE41-NEXT:    addq %rcx, %rsi
-; SSE41-NEXT:    setb %cl
-; SSE41-NEXT:    orb %bl, %cl
-; SSE41-NEXT:    orb %r13b, %cl
+; SSE41-NEXT:    addq %rbx, %rsi
+; SSE41-NEXT:    setb %r13b
+; SSE41-NEXT:    orb %cl, %r13b
+; SSE41-NEXT:    orb %bpl, %r13b
 ; SSE41-NEXT:    testq %r9, %r9
 ; SSE41-NEXT:    setne %al
 ; SSE41-NEXT:    testq %r12, %r12
 ; SSE41-NEXT:    setne %r10b
 ; SSE41-NEXT:    andb %al, %r10b
 ; SSE41-NEXT:    movq %r12, %rax
-; SSE41-NEXT:    mulq %r15
-; SSE41-NEXT:    movq %rax, %rdi
-; SSE41-NEXT:    seto %bpl
+; SSE41-NEXT:    mulq %r14
+; SSE41-NEXT:    movq %rax, %rbp
+; SSE41-NEXT:    seto %r8b
 ; SSE41-NEXT:    movq %r9, %rax
 ; SSE41-NEXT:    mulq %r11
-; SSE41-NEXT:    movq %rax, %rbx
-; SSE41-NEXT:    seto %r9b
-; SSE41-NEXT:    orb %bpl, %r9b
-; SSE41-NEXT:    addq %rdi, %rbx
+; SSE41-NEXT:    seto %cl
+; SSE41-NEXT:    orb %r8b, %cl
+; SSE41-NEXT:    addq %rax, %rbp
 ; SSE41-NEXT:    movq %r11, %rax
-; SSE41-NEXT:    mulq %r15
-; SSE41-NEXT:    addq %rbx, %rdx
+; SSE41-NEXT:    mulq %r14
+; SSE41-NEXT:    addq %rbp, %rdx
 ; SSE41-NEXT:    setb %bl
-; SSE41-NEXT:    orb %r9b, %bl
+; SSE41-NEXT:    orb %cl, %bl
 ; SSE41-NEXT:    orb %r10b, %bl
-; SSE41-NEXT:    movzbl %bl, %edi
-; SSE41-NEXT:    negl %edi
-; SSE41-NEXT:    movzbl %cl, %ecx
+; SSE41-NEXT:    movzbl %bl, %ecx
 ; SSE41-NEXT:    negl %ecx
-; SSE41-NEXT:    movd %ecx, %xmm0
-; SSE41-NEXT:    pinsrd $1, %edi, %xmm0
-; SSE41-NEXT:    movq %rax, 16(%r14)
-; SSE41-NEXT:    movq %r8, (%r14)
-; SSE41-NEXT:    movq %rdx, 24(%r14)
-; SSE41-NEXT:    movq %rsi, 8(%r14)
+; SSE41-NEXT:    movzbl %r13b, %ebp
+; SSE41-NEXT:    negl %ebp
+; SSE41-NEXT:    movd %ebp, %xmm0
+; SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
+; SSE41-NEXT:    movq %rax, 16(%r15)
+; SSE41-NEXT:    movq %rdi, (%r15)
+; SSE41-NEXT:    movq %rdx, 24(%r15)
+; SSE41-NEXT:    movq %rsi, 8(%r15)
 ; SSE41-NEXT:    popq %rbx
 ; SSE41-NEXT:    popq %r12
 ; SSE41-NEXT:    popq %r13
@@ -3182,62 +3176,60 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; AVX-NEXT:    movq %rcx, %r12
 ; AVX-NEXT:    movq %rdx, %r11
 ; AVX-NEXT:    movq %rsi, %rax
-; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r9
 ; AVX-NEXT:    testq %r10, %r10
-; AVX-NEXT:    setne %cl
+; AVX-NEXT:    setne %dl
 ; AVX-NEXT:    testq %rsi, %rsi
-; AVX-NEXT:    setne %r13b
-; AVX-NEXT:    andb %cl, %r13b
+; AVX-NEXT:    setne %bpl
+; AVX-NEXT:    andb %dl, %bpl
 ; AVX-NEXT:    mulq %r8
 ; AVX-NEXT:    movq %rax, %rsi
-; AVX-NEXT:    seto %bpl
+; AVX-NEXT:    seto %bl
 ; AVX-NEXT:    movq %r10, %rax
 ; AVX-NEXT:    mulq %rdi
-; AVX-NEXT:    movq %rax, %rcx
-; AVX-NEXT:    seto %bl
-; AVX-NEXT:    orb %bpl, %bl
-; AVX-NEXT:    addq %rsi, %rcx
+; AVX-NEXT:    seto %cl
+; AVX-NEXT:    orb %bl, %cl
+; AVX-NEXT:    leaq (%rsi,%rax), %rbx
 ; AVX-NEXT:    movq %rdi, %rax
 ; AVX-NEXT:    mulq %r8
-; AVX-NEXT:    movq %rax, %r8
+; AVX-NEXT:    movq %rax, %rdi
 ; AVX-NEXT:    movq %rdx, %rsi
-; AVX-NEXT:    addq %rcx, %rsi
-; AVX-NEXT:    setb %cl
-; AVX-NEXT:    orb %bl, %cl
-; AVX-NEXT:    orb %r13b, %cl
+; AVX-NEXT:    addq %rbx, %rsi
+; AVX-NEXT:    setb %r13b
+; AVX-NEXT:    orb %cl, %r13b
+; AVX-NEXT:    orb %bpl, %r13b
 ; AVX-NEXT:    testq %r9, %r9
 ; AVX-NEXT:    setne %al
 ; AVX-NEXT:    testq %r12, %r12
 ; AVX-NEXT:    setne %r10b
 ; AVX-NEXT:    andb %al, %r10b
 ; AVX-NEXT:    movq %r12, %rax
-; AVX-NEXT:    mulq %r15
-; AVX-NEXT:    movq %rax, %rdi
-; AVX-NEXT:    seto %bpl
+; AVX-NEXT:    mulq %r14
+; AVX-NEXT:    movq %rax, %rbp
+; AVX-NEXT:    seto %r8b
 ; AVX-NEXT:    movq %r9, %rax
 ; AVX-NEXT:    mulq %r11
-; AVX-NEXT:    movq %rax, %rbx
-; AVX-NEXT:    seto %r9b
-; AVX-NEXT:    orb %bpl, %r9b
-; AVX-NEXT:    addq %rdi, %rbx
+; AVX-NEXT:    seto %cl
+; AVX-NEXT:    orb %r8b, %cl
+; AVX-NEXT:    addq %rax, %rbp
 ; AVX-NEXT:    movq %r11, %rax
-; AVX-NEXT:    mulq %r15
-; AVX-NEXT:    addq %rbx, %rdx
+; AVX-NEXT:    mulq %r14
+; AVX-NEXT:    addq %rbp, %rdx
 ; AVX-NEXT:    setb %bl
-; AVX-NEXT:    orb %r9b, %bl
+; AVX-NEXT:    orb %cl, %bl
 ; AVX-NEXT:    orb %r10b, %bl
-; AVX-NEXT:    movzbl %bl, %edi
-; AVX-NEXT:    negl %edi
-; AVX-NEXT:    movzbl %cl, %ecx
+; AVX-NEXT:    movzbl %bl, %ecx
 ; AVX-NEXT:    negl %ecx
-; AVX-NEXT:    vmovd %ecx, %xmm0
-; AVX-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
-; AVX-NEXT:    movq %rax, 16(%r14)
-; AVX-NEXT:    movq %r8, (%r14)
-; AVX-NEXT:    movq %rdx, 24(%r14)
-; AVX-NEXT:    movq %rsi, 8(%r14)
+; AVX-NEXT:    movzbl %r13b, %ebp
+; AVX-NEXT:    negl %ebp
+; AVX-NEXT:    vmovd %ebp, %xmm0
+; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX-NEXT:    movq %rax, 16(%r15)
+; AVX-NEXT:    movq %rdi, (%r15)
+; AVX-NEXT:    movq %rdx, 24(%r15)
+; AVX-NEXT:    movq %rsi, 8(%r15)
 ; AVX-NEXT:    popq %rbx
 ; AVX-NEXT:    popq %r12
 ; AVX-NEXT:    popq %r13
@@ -3251,7 +3243,6 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; AVX512F-NEXT:    pushq %rbp
 ; AVX512F-NEXT:    pushq %r15
 ; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %r13
 ; AVX512F-NEXT:    pushq %r12
 ; AVX512F-NEXT:    pushq %rbx
 ; AVX512F-NEXT:    movq %rcx, %rax
@@ -3263,25 +3254,24 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; AVX512F-NEXT:    testq %r10, %r10
 ; AVX512F-NEXT:    setne %dl
 ; AVX512F-NEXT:    testq %rcx, %rcx
-; AVX512F-NEXT:    setne %r13b
-; AVX512F-NEXT:    andb %dl, %r13b
+; AVX512F-NEXT:    setne %bl
+; AVX512F-NEXT:    andb %dl, %bl
 ; AVX512F-NEXT:    mulq %r15
 ; AVX512F-NEXT:    movq %rax, %rdi
 ; AVX512F-NEXT:    seto %bpl
 ; AVX512F-NEXT:    movq %r10, %rax
 ; AVX512F-NEXT:    mulq %r12
-; AVX512F-NEXT:    movq %rax, %rbx
 ; AVX512F-NEXT:    seto %cl
 ; AVX512F-NEXT:    orb %bpl, %cl
-; AVX512F-NEXT:    addq %rdi, %rbx
+; AVX512F-NEXT:    leaq (%rdi,%rax), %rbp
 ; AVX512F-NEXT:    movq %r12, %rax
 ; AVX512F-NEXT:    mulq %r15
 ; AVX512F-NEXT:    movq %rax, %r10
-; AVX512F-NEXT:    movq %rdx, %r15
-; AVX512F-NEXT:    addq %rbx, %r15
+; AVX512F-NEXT:    movq %rdx, %rdi
+; AVX512F-NEXT:    addq %rbp, %rdi
 ; AVX512F-NEXT:    setb %al
 ; AVX512F-NEXT:    orb %cl, %al
-; AVX512F-NEXT:    orb %r13b, %al
+; AVX512F-NEXT:    orb %bl, %al
 ; AVX512F-NEXT:    kmovw %eax, %k0
 ; AVX512F-NEXT:    testq %r9, %r9
 ; AVX512F-NEXT:    setne %al
@@ -3294,13 +3284,12 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; AVX512F-NEXT:    seto %bpl
 ; AVX512F-NEXT:    movq %r9, %rax
 ; AVX512F-NEXT:    mulq %r11
-; AVX512F-NEXT:    movq %rax, %rdi
 ; AVX512F-NEXT:    seto %bl
 ; AVX512F-NEXT:    orb %bpl, %bl
-; AVX512F-NEXT:    addq %rsi, %rdi
+; AVX512F-NEXT:    addq %rax, %rsi
 ; AVX512F-NEXT:    movq %r11, %rax
 ; AVX512F-NEXT:    mulq %r8
-; AVX512F-NEXT:    addq %rdi, %rdx
+; AVX512F-NEXT:    addq %rsi, %rdx
 ; AVX512F-NEXT:    setb %sil
 ; AVX512F-NEXT:    orb %bl, %sil
 ; AVX512F-NEXT:    orb %cl, %sil
@@ -3312,11 +3301,10 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; AVX512F-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512F-NEXT:    movq %r10, 16(%r14)
 ; AVX512F-NEXT:    movq %rax, (%r14)
-; AVX512F-NEXT:    movq %r15, 24(%r14)
+; AVX512F-NEXT:    movq %rdi, 24(%r14)
 ; AVX512F-NEXT:    movq %rdx, 8(%r14)
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    popq %r12
-; AVX512F-NEXT:    popq %r13
 ; AVX512F-NEXT:    popq %r14
 ; AVX512F-NEXT:    popq %r15
 ; AVX512F-NEXT:    popq %rbp
@@ -3327,7 +3315,6 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; AVX512BW-NEXT:    pushq %rbp
 ; AVX512BW-NEXT:    pushq %r15
 ; AVX512BW-NEXT:    pushq %r14
-; AVX512BW-NEXT:    pushq %r13
 ; AVX512BW-NEXT:    pushq %r12
 ; AVX512BW-NEXT:    pushq %rbx
 ; AVX512BW-NEXT:    movq %rcx, %rax
@@ -3339,25 +3326,24 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; AVX512BW-NEXT:    testq %r10, %r10
 ; AVX512BW-NEXT:    setne %dl
 ; AVX512BW-NEXT:    testq %rcx, %rcx
-; AVX512BW-NEXT:    setne %r13b
-; AVX512BW-NEXT:    andb %dl, %r13b
+; AVX512BW-NEXT:    setne %bl
+; AVX512BW-NEXT:    andb %dl, %bl
 ; AVX512BW-NEXT:    mulq %r15
 ; AVX512BW-NEXT:    movq %rax, %rdi
 ; AVX512BW-NEXT:    seto %bpl
 ; AVX512BW-NEXT:    movq %r10, %rax
 ; AVX512BW-NEXT:    mulq %r12
-; AVX512BW-NEXT:    movq %rax, %rbx
 ; AVX512BW-NEXT:    seto %cl
 ; AVX512BW-NEXT:    orb %bpl, %cl
-; AVX512BW-NEXT:    addq %rdi, %rbx
+; AVX512BW-NEXT:    leaq (%rdi,%rax), %rbp
 ; AVX512BW-NEXT:    movq %r12, %rax
 ; AVX512BW-NEXT:    mulq %r15
 ; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    movq %rdx, %r15
-; AVX512BW-NEXT:    addq %rbx, %r15
+; AVX512BW-NEXT:    movq %rdx, %rdi
+; AVX512BW-NEXT:    addq %rbp, %rdi
 ; AVX512BW-NEXT:    setb %al
 ; AVX512BW-NEXT:    orb %cl, %al
-; AVX512BW-NEXT:    orb %r13b, %al
+; AVX512BW-NEXT:    orb %bl, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k0
 ; AVX512BW-NEXT:    testq %r9, %r9
 ; AVX512BW-NEXT:    setne %al
@@ -3370,13 +3356,12 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; AVX512BW-NEXT:    seto %bpl
 ; AVX512BW-NEXT:    movq %r9, %rax
 ; AVX512BW-NEXT:    mulq %r11
-; AVX512BW-NEXT:    movq %rax, %rdi
 ; AVX512BW-NEXT:    seto %bl
 ; AVX512BW-NEXT:    orb %bpl, %bl
-; AVX512BW-NEXT:    addq %rsi, %rdi
+; AVX512BW-NEXT:    addq %rax, %rsi
 ; AVX512BW-NEXT:    movq %r11, %rax
 ; AVX512BW-NEXT:    mulq %r8
-; AVX512BW-NEXT:    addq %rdi, %rdx
+; AVX512BW-NEXT:    addq %rsi, %rdx
 ; AVX512BW-NEXT:    setb %sil
 ; AVX512BW-NEXT:    orb %bl, %sil
 ; AVX512BW-NEXT:    orb %cl, %sil
@@ -3388,11 +3373,10 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2)
 ; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512BW-NEXT:    movq %r10, 16(%r14)
 ; AVX512BW-NEXT:    movq %rax, (%r14)
-; AVX512BW-NEXT:    movq %r15, 24(%r14)
+; AVX512BW-NEXT:    movq %rdi, 24(%r14)
 ; AVX512BW-NEXT:    movq %rdx, 8(%r14)
 ; AVX512BW-NEXT:    popq %rbx
 ; AVX512BW-NEXT:    popq %r12
-; AVX512BW-NEXT:    popq %r13
 ; AVX512BW-NEXT:    popq %r14
 ; AVX512BW-NEXT:    popq %r15
 ; AVX512BW-NEXT:    popq %rbp

diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index d416b1a547815..71d92af0dd94b 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -487,10 +487,9 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, i64* %res) {
 ; WIN32-NEXT:    movl %esi, %eax
 ; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %ecx, %edx
-; WIN32-NEXT:    movl %eax, %esi
 ; WIN32-NEXT:    seto %ch
 ; WIN32-NEXT:    orb %bh, %ch
-; WIN32-NEXT:    addl %edi, %esi
+; WIN32-NEXT:    leal (%edi,%eax), %esi
 ; WIN32-NEXT:    movl %edx, %eax
 ; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    addl %esi, %edx
@@ -713,6 +712,7 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    testl %ebp, %ebp
 ; WIN32-NEXT:    setne %al
@@ -720,26 +720,26 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    setne %bl
 ; WIN32-NEXT:    andb %al, %bl
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl %edi, %edx
 ; WIN32-NEXT:    movl %eax, %edi
 ; WIN32-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    movl %edx, %ebp
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %eax, %ebp
 ; WIN32-NEXT:    seto %bh
 ; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; WIN32-NEXT:    addl %edi, %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    addl %eax, %edi
 ; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    mull %edi
-; WIN32-NEXT:    addl %ebp, %edx
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    addl %edi, %edx
 ; WIN32-NEXT:    setb %al
 ; WIN32-NEXT:    orb %bh, %al
 ; WIN32-NEXT:    orb %bl, %al
 ; WIN32-NEXT:    testb %al, %al
 ; WIN32-NEXT:    jne LBB14_2
 ; WIN32-NEXT:  # %bb.1:
-; WIN32-NEXT:    movl %edi, %ecx
+; WIN32-NEXT:    movl %ebp, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:  LBB14_2:
 ; WIN32-NEXT:    movl %ecx, %eax
@@ -1337,10 +1337,9 @@ define zeroext i1 @umulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    movl %esi, %eax
 ; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %ecx, %edx
-; WIN32-NEXT:    movl %eax, %esi
 ; WIN32-NEXT:    seto %cl
 ; WIN32-NEXT:    orb %bh, %cl
-; WIN32-NEXT:    addl %edi, %esi
+; WIN32-NEXT:    leal (%edi,%eax), %esi
 ; WIN32-NEXT:    movl %edx, %eax
 ; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    addl %esi, %edx
@@ -2244,10 +2243,9 @@ define zeroext i1 @umuloi64_load(i64* %ptr1, i64 %v2, i64* %res) {
 ; WIN32-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; WIN32-NEXT:    movl %esi, %eax
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %eax, %esi
 ; WIN32-NEXT:    seto %bh
 ; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; WIN32-NEXT:    addl %edi, %esi
+; WIN32-NEXT:    leal (%edi,%eax), %esi
 ; WIN32-NEXT:    movl %ecx, %eax
 ; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    addl %esi, %edx
@@ -2325,10 +2323,9 @@ define zeroext i1 @umuloi64_load2(i64 %v1, i64* %ptr2, i64* %res) {
 ; WIN32-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; WIN32-NEXT:    movl %esi, %eax
 ; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    movl %eax, %esi
 ; WIN32-NEXT:    seto %bh
 ; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; WIN32-NEXT:    addl %edi, %esi
+; WIN32-NEXT:    leal (%edi,%eax), %esi
 ; WIN32-NEXT:    movl %ebp, %eax
 ; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    addl %esi, %edx
