[llvm] d5521d1 - [DAG] Reducing instructions by better legalization handling of AVGFLOORU for illegal data types (#99913)

Sat Jul 27 17:33:12 PDT 2024

Author: Julius Alexandre
Date: 2024-07-27T17:33:09-07:00
New Revision: d5521d128494690be66e03a674b9d1181935bf77

URL: https://github.com/llvm/llvm-project/commit/d5521d128494690be66e03a674b9d1181935bf77
DIFF: https://github.com/llvm/llvm-project/commit/d5521d128494690be66e03a674b9d1181935bf77.diff

LOG: [DAG] Reducing instructions by better legalization handling of AVGFLOORU for illegal data types (#99913)

**Issue:** https://github.com/rust-lang/rust/issues/124790
**Previous PR:** https://github.com/llvm/llvm-project/pull/99614

https://rust.godbolt.org/z/T7eKP3Tvo

**Aarch64:** https://alive2.llvm.org/ce/z/dqr2Kg
**x86:** https://alive2.llvm.org/ce/z/ze88Hw

cc: @RKSimon @topperc

Added: 
    llvm/test/CodeGen/AArch64/avgflooru-i128.ll
    llvm/test/CodeGen/X86/avgflooru-i128.ll

Modified: 
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/test/CodeGen/RISCV/avgflooru.ll
    llvm/test/CodeGen/X86/avgflooru-scalar.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 6fd23b5ab9f5f..7fa83a5999dfe 100644

--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9379,6 +9379,26 @@ SDValue TargetLowering::expandAVG(SDNode *N, SelectionDAG &DAG) const {
     }
   }
 
+  // avgflooru(lhs, rhs) -> or(lshr(add(lhs, rhs),1),shl(overflow, typesize-1))
+  if (Opc == ISD::AVGFLOORU && VT.isScalarInteger() && !isTypeLegal(VT)) {
+    SDValue UAddWithOverflow =
+        DAG.getNode(ISD::UADDO, dl, DAG.getVTList(VT, MVT::i1), {RHS, LHS});
+
+    SDValue Sum = UAddWithOverflow.getValue(0);
+    SDValue Overflow = UAddWithOverflow.getValue(1);
+
+    // Right shift the sum by 1
+    SDValue One = DAG.getShiftAmountConstant(1, VT, dl);
+    SDValue LShrVal = DAG.getNode(ISD::SRL, dl, VT, Sum, One);
+
+    SDValue ZeroExtOverflow = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Overflow);
+    SDValue OverflowShl =
+        DAG.getNode(ISD::SHL, dl, VT, ZeroExtOverflow,
+                    DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, VT));
+
+    return DAG.getNode(ISD::OR, dl, VT, LShrVal, OverflowShl);
+  }
+
   // avgceils(lhs, rhs) -> sub(or(lhs,rhs),ashr(xor(lhs,rhs),1))
   // avgceilu(lhs, rhs) -> sub(or(lhs,rhs),lshr(xor(lhs,rhs),1))
   // avgfloors(lhs, rhs) -> add(and(lhs,rhs),ashr(xor(lhs,rhs),1))

diff  --git a/llvm/test/CodeGen/AArch64/avgflooru-i128.ll b/llvm/test/CodeGen/AArch64/avgflooru-i128.ll
new file mode 100644
index 0000000000000..d336c38f20799
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/avgflooru-i128.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+define i128 @avgflooru_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: avgflooru_i128:
+; CHECK:       // %bb.0: // %start
+; CHECK-NEXT:    adds x8, x0, x2
+; CHECK-NEXT:    adcs x9, x1, x3
+; CHECK-NEXT:    cset w10, hs
+; CHECK-NEXT:    extr x0, x9, x8, #1
+; CHECK-NEXT:    extr x1, x10, x9, #1
+; CHECK-NEXT:    ret
+start:
+  %xor = xor i128 %y, %x
+  %lshr = lshr i128 %xor, 1
+  %and = and i128 %y, %x
+  %add = add i128 %lshr, %and
+  ret i128 %add
+}
+
+declare void @use(i8)
+
+define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) nounwind {
+; CHECK-LABEL: avgflooru_i128_multi_use:
+; CHECK:       // %bb.0: // %start
+; CHECK-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    eor x23, x3, x1
+; CHECK-NEXT:    eor x24, x2, x0
+; CHECK-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    mov x21, x1
+; CHECK-NEXT:    mov x22, x0
+; CHECK-NEXT:    mov x0, x24
+; CHECK-NEXT:    mov x1, x23
+; CHECK-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    mov x19, x3
+; CHECK-NEXT:    mov x20, x2
+; CHECK-NEXT:    bl use
+; CHECK-NEXT:    extr x0, x23, x24, #1
+; CHECK-NEXT:    lsr x1, x23, #1
+; CHECK-NEXT:    bl use
+; CHECK-NEXT:    adds x8, x22, x20
+; CHECK-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    adcs x9, x21, x19
+; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    cset w10, hs
+; CHECK-NEXT:    extr x0, x9, x8, #1
+; CHECK-NEXT:    extr x1, x10, x9, #1
+; CHECK-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+start:
+  %xor = xor i128 %y, %x
+  call void @use(i128 %xor)
+  %lshr = lshr i128 %xor, 1
+  call void @use(i128 %lshr)
+  %and = and i128 %y, %x
+  %add = add i128 %lshr, %and
+  ret i128 %add
+}
+
+; The `avgflooru_i128_negative` test shouldn't combine because it's not
+; an avgflooru operation, which is what we're targeting.
+
+define i128 @avgflooru_i128_negative(i128 %x, i128 %y) {
+; CHECK-LABEL: avgflooru_i128_negative:
+; CHECK:       // %bb.0: // %start
+; CHECK-NEXT:    mvn x8, x0
+; CHECK-NEXT:    and x9, x2, x0
+; CHECK-NEXT:    mvn x10, x1
+; CHECK-NEXT:    and x11, x3, x1
+; CHECK-NEXT:    adds x0, x8, x9
+; CHECK-NEXT:    adc x1, x10, x11
+; CHECK-NEXT:    ret
+start:
+  %xor = xor i128 %x, -1
+  %and = and i128 %y, %x
+  %add = add i128 %xor, %and
+  ret i128 %add
+}
+
+; This negative test case shouldn't combine: i32 is already properly
+; handled by legalization, unlike i128.
+
+define i32 @avgflooru_i128_negative2(i32 %x, i32 %y) {
+; CHECK-LABEL: avgflooru_i128_negative2:
+; CHECK:       // %bb.0: // %start
+; CHECK-NEXT:    mov w8, w1
+; CHECK-NEXT:    add x8, x8, w0, uxtw
+; CHECK-NEXT:    lsr x0, x8, #1
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+start:
+  %xor = xor i32 %y, %x
+  %lshr = lshr i32 %xor, 1
+  %and = and i32 %y, %x
+  %add = add i32 %lshr, %and
+  ret i32 %add
+}
+
+define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) {
+; CHECK-LABEL: avgflooru_i128_vec:
+; CHECK:       // %bb.0: // %start
+; CHECK-NEXT:    adds x8, x0, x4
+; CHECK-NEXT:    adcs x9, x1, x5
+; CHECK-NEXT:    cset w10, hs
+; CHECK-NEXT:    adds x11, x2, x6
+; CHECK-NEXT:    extr x0, x9, x8, #1
+; CHECK-NEXT:    adcs x12, x3, x7
+; CHECK-NEXT:    extr x1, x10, x9, #1
+; CHECK-NEXT:    extr x11, x12, x11, #1
+; CHECK-NEXT:    cset w13, hs
+; CHECK-NEXT:    extr x3, x13, x12, #1
+; CHECK-NEXT:    fmov d0, x11
+; CHECK-NEXT:    mov v0.d[1], x3
+; CHECK-NEXT:    fmov x2, d0
+; CHECK-NEXT:    ret
+start:
+  %xor = xor <2 x i128> %y, %x
+  %lshr = lshr <2 x i128> %xor, <i128 1, i128 1>
+  %and = and <2 x i128> %y, %x
+  %add = add <2 x i128> %lshr, %and
+  ret <2 x i128> %add
+}

diff  --git a/llvm/test/CodeGen/RISCV/avgflooru.ll b/llvm/test/CodeGen/RISCV/avgflooru.ll
index b58aaab6aaf4a..fa88c3760e455 100644
--- a/llvm/test/CodeGen/RISCV/avgflooru.ll
+++ b/llvm/test/CodeGen/RISCV/avgflooru.ll
@@ -164,18 +164,20 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind {
 define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I-LABEL: test_fixed_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    and a4, a1, a3
-; RV32I-NEXT:    xor a1, a1, a3
-; RV32I-NEXT:    srli a3, a1, 1
-; RV32I-NEXT:    add a3, a4, a3
-; RV32I-NEXT:    slli a1, a1, 31
-; RV32I-NEXT:    xor a4, a0, a2
-; RV32I-NEXT:    srli a4, a4, 1
-; RV32I-NEXT:    or a1, a4, a1
-; RV32I-NEXT:    and a2, a0, a2
-; RV32I-NEXT:    add a0, a2, a1
+; RV32I-NEXT:    add a4, a3, a1
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    sltu a1, a0, a2
-; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add a2, a4, a1
+; RV32I-NEXT:    beq a2, a3, .LBB6_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sltu a1, a2, a3
+; RV32I-NEXT:  .LBB6_2:
+; RV32I-NEXT:    slli a1, a1, 31
+; RV32I-NEXT:    srli a3, a2, 1
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    slli a2, a2, 31
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_fixed_i64:
@@ -195,18 +197,20 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I-LABEL: test_ext_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    and a4, a1, a3
-; RV32I-NEXT:    xor a1, a1, a3
-; RV32I-NEXT:    srli a3, a1, 1
-; RV32I-NEXT:    add a3, a4, a3
-; RV32I-NEXT:    slli a1, a1, 31
-; RV32I-NEXT:    xor a4, a0, a2
-; RV32I-NEXT:    srli a4, a4, 1
-; RV32I-NEXT:    or a1, a4, a1
-; RV32I-NEXT:    and a2, a0, a2
-; RV32I-NEXT:    add a0, a2, a1
+; RV32I-NEXT:    add a4, a3, a1
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    sltu a1, a0, a2
-; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add a2, a4, a1
+; RV32I-NEXT:    beq a2, a3, .LBB7_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    sltu a1, a2, a3
+; RV32I-NEXT:  .LBB7_2:
+; RV32I-NEXT:    slli a1, a1, 31
+; RV32I-NEXT:    srli a3, a2, 1
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    slli a2, a2, 31
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_ext_i64:

diff  --git a/llvm/test/CodeGen/X86/avgflooru-i128.ll b/llvm/test/CodeGen/X86/avgflooru-i128.ll
new file mode 100644
index 0000000000000..da16a7da48ca6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avgflooru-i128.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=x86_64 < %s | FileCheck %s
+
+define i128 @avgflooru_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: avgflooru_i128:
+; CHECK:       # %bb.0: # %start
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    addq %rdx, %rax
+; CHECK-NEXT:    adcq %rcx, %rsi
+; CHECK-NEXT:    setb %cl
+; CHECK-NEXT:    shrdq $1, %rsi, %rax
+; CHECK-NEXT:    movzbl %cl, %edx
+; CHECK-NEXT:    shldq $63, %rsi, %rdx
+; CHECK-NEXT:    retq
+start:
+  %xor = xor i128 %y, %x
+  %lshr = lshr i128 %xor, 1
+  %and = and i128 %y, %x
+  %add = add i128 %lshr, %and
+  ret i128 %add
+}
+
+declare void @use(i8)
+
+define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) nounwind {
+; CHECK-LABEL: avgflooru_i128_multi_use:
+; CHECK:       # %bb.0: # %start
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    movq %rcx, %rbx
+; CHECK-NEXT:    movq %rdx, %r14
+; CHECK-NEXT:    movq %rsi, %r15
+; CHECK-NEXT:    movq %rdi, %r12
+; CHECK-NEXT:    movq %rdx, %r13
+; CHECK-NEXT:    xorq %rdi, %r13
+; CHECK-NEXT:    movq %rcx, %rbp
+; CHECK-NEXT:    xorq %rsi, %rbp
+; CHECK-NEXT:    movq %r13, %rdi
+; CHECK-NEXT:    movq %rbp, %rsi
+; CHECK-NEXT:    callq use@PLT
+; CHECK-NEXT:    shrdq $1, %rbp, %r13
+; CHECK-NEXT:    shrq %rbp
+; CHECK-NEXT:    movq %r13, %rdi
+; CHECK-NEXT:    movq %rbp, %rsi
+; CHECK-NEXT:    callq use@PLT
+; CHECK-NEXT:    addq %r14, %r12
+; CHECK-NEXT:    adcq %rbx, %r15
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    shrdq $1, %r15, %r12
+; CHECK-NEXT:    movzbl %al, %edx
+; CHECK-NEXT:    shldq $63, %r15, %rdx
+; CHECK-NEXT:    movq %r12, %rax
+; CHECK-NEXT:    addq $8, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    retq
+start:
+  %xor = xor i128 %y, %x
+  call void @use(i128 %xor)
+  %lshr = lshr i128 %xor, 1
+  call void @use(i128 %lshr)
+  %and = and i128 %y, %x
+  %add = add i128 %lshr, %and
+  ret i128 %add
+}
+
+; This test case shouldn't combine because it's not
+; an avgflooru operation
+
+define i128 @avgflooru_i128_negative(i128 %x, i128 %y) {
+; CHECK-LABEL: avgflooru_i128_negative:
+; CHECK:       # %bb.0: # %start
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    andq %rsi, %rcx
+; CHECK-NEXT:    notq %rsi
+; CHECK-NEXT:    andq %rdi, %rdx
+; CHECK-NEXT:    notq %rax
+; CHECK-NEXT:    addq %rdx, %rax
+; CHECK-NEXT:    adcq %rcx, %rsi
+; CHECK-NEXT:    movq %rsi, %rdx
+; CHECK-NEXT:    retq
+start:
+  %xor = xor i128 %x, -1
+  %and = and i128 %y, %x
+  %add = add i128 %xor, %and
+  ret i128 %add
+}
+
+; This negative test case shouldn't combine: i32 is already properly
+; handled by legalization, unlike i128.
+
+define i32 @avgflooru_i128_negative2(i32 %x, i32 %y) {
+; CHECK-LABEL: avgflooru_i128_negative2:
+; CHECK:       # %bb.0: # %start
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    addq %rcx, %rax
+; CHECK-NEXT:    shrq %rax
+; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-NEXT:    retq
+start:
+  %xor = xor i32 %y, %x
+  %lshr = lshr i32 %xor, 1
+  %and = and i32 %y, %x
+  %add = add i32 %lshr, %and
+  ret i32 %add
+}
+
+define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) {
+; CHECK-LABEL: avgflooru_i128_vec:
+; CHECK:       # %bb.0: # %start
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    addq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT:    setb %dil
+; CHECK-NEXT:    movzbl %dil, %edi
+; CHECK-NEXT:    shldq $63, %rdx, %rdi
+; CHECK-NEXT:    addq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    adcq {{[0-9]+}}(%rsp), %r8
+; CHECK-NEXT:    setb %r9b
+; CHECK-NEXT:    movzbl %r9b, %r9d
+; CHECK-NEXT:    shldq $63, %r8, %r9
+; CHECK-NEXT:    shldq $63, %rsi, %rdx
+; CHECK-NEXT:    shldq $63, %rcx, %r8
+; CHECK-NEXT:    movq %r8, 16(%rax)
+; CHECK-NEXT:    movq %rdx, (%rax)
+; CHECK-NEXT:    movq %r9, 24(%rax)
+; CHECK-NEXT:    movq %rdi, 8(%rax)
+; CHECK-NEXT:    retq
+start:
+  %xor = xor <2 x i128> %y, %x
+  %lshr = lshr <2 x i128> %xor, <i128 1, i128 1>
+  %and = and <2 x i128> %y, %x
+  %add = add <2 x i128> %lshr, %and
+  ret <2 x i128> %add
+}

diff  --git a/llvm/test/CodeGen/X86/avgflooru-scalar.ll b/llvm/test/CodeGen/X86/avgflooru-scalar.ll
index d21c9d65ea9c8..0c91a9da5720a 100644
--- a/llvm/test/CodeGen/X86/avgflooru-scalar.ll
+++ b/llvm/test/CodeGen/X86/avgflooru-scalar.ll
@@ -168,26 +168,14 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind {
 define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 ; X86-LABEL: test_fixed_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl %esi, %ebx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    xorl %edi, %edx
-; X86-NEXT:    shrdl $1, %edx, %ebx
-; X86-NEXT:    andl %edi, %ecx
-; X86-NEXT:    shrl %edx
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    setb %dl
+; X86-NEXT:    movzbl %dl, %edx
+; X86-NEXT:    shldl $31, %eax, %edx
+; X86-NEXT:    shldl $31, %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_fixed_i64:
@@ -208,26 +196,14 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
 ; X86-LABEL: test_ext_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl %esi, %ebx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    xorl %edi, %edx
-; X86-NEXT:    shrdl $1, %edx, %ebx
-; X86-NEXT:    andl %edi, %ecx
-; X86-NEXT:    shrl %edx
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    setb %dl
+; X86-NEXT:    movzbl %dl, %edx
+; X86-NEXT:    shldl $31, %eax, %edx
+; X86-NEXT:    shldl $31, %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_ext_i64: