[llvm] 0121b1a - Revert "[TargetLowering][RISCV][X86] Support even divisors in expandDIVREMByConstant."
Craig Topper via llvm-commits
llvm-commits@lists.llvm.org
Mon Oct 10 14:53:42 PDT 2022
Author: Craig Topper
Date: 2022-10-10T14:53:29-07:00
New Revision: 0121b1a4ac8d521dd6c2465bf4541434d6c5f940
URL: https://github.com/llvm/llvm-project/commit/0121b1a4ac8d521dd6c2465bf4541434d6c5f940
DIFF: https://github.com/llvm/llvm-project/commit/0121b1a4ac8d521dd6c2465bf4541434d6c5f940.diff
LOG: Revert "[TargetLowering][RISCV][X86] Support even divisors in expandDIVREMByConstant."
This reverts commit d4facda414b6b9b8b1a34bc7e6b7c15172775318.
This has been reported to cause failures. Reverting while I investigate.
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
llvm/test/CodeGen/X86/divide-by-constant.ll
llvm/test/CodeGen/X86/divmod128.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 9dc84a6b4dbbd..a3070fe31c47e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7168,15 +7168,8 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
// Remainder = Sum % Constant
// This is based on "Remainder by Summing Digits" from Hacker's Delight.
//
-// If Constant is even, we can shift right the dividend and the divisor by the
-// number of trailing zeros in Constant before computing the remainder. Then
-// fixup the remainder by shifting it left by the number of trailing zeros and
-// adding the bits that were shifted out of the dividend.
-//
-// For division, we can compute the remainder using the algorithm described
-// above, subtract it from the dividend to get an exact multiple of Constant.
-// Then multiply that exact multiple by the multiplicative inverse modulo
-// (1 << (BitWidth / 2)).
+// For division, we can compute the remainder, subtract it from the dividend,
+// and then multiply by the multiplicative inverse modulo (1 << (BitWidth / 2)).
bool TargetLowering::expandDIVREMByConstant(SDNode *N,
SmallVectorImpl<SDValue> &Result,
EVT HiLoVT, SelectionDAG &DAG,
@@ -7195,7 +7188,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
if (!CN)
return false;
- APInt Divisor = CN->getAPIntValue();
+ const APInt &Divisor = CN->getAPIntValue();
unsigned BitWidth = Divisor.getBitWidth();
unsigned HBitWidth = BitWidth / 2;
assert(VT.getScalarSizeInBits() == BitWidth &&
@@ -7216,17 +7209,10 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
if (DAG.shouldOptForSize())
return false;
- // Early out for 0 or 1 divisors.
- if (Divisor.ule(1))
+ // Early out for 0, 1 or even divisors.
+ if (Divisor.ule(1) || Divisor[0] == 0)
return false;
- // If the divisor is even, shift it until it becomes odd.
- unsigned TrailingZeros = 0;
- if (!Divisor[0]) {
- TrailingZeros = Divisor.countTrailingZeros();
- Divisor.lshrInPlace(TrailingZeros);
- }
-
SDLoc dl(N);
SDValue Sum;
@@ -7243,35 +7229,17 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
DAG.getIntPtrConstant(1, dl));
}
- SDValue ShiftedLL = LL;
- SDValue ShiftedLH = LH;
-
- // Shift the input by the number of TrailingZeros in the divisor. The
- // shifted out bits will be added to the remainder later.
- if (TrailingZeros) {
- ShiftedLL = DAG.getNode(
- ISD::OR, dl, HiLoVT,
- DAG.getNode(ISD::SRL, dl, HiLoVT, ShiftedLL,
- DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl)),
- DAG.getNode(ISD::SHL, dl, HiLoVT, ShiftedLH,
- DAG.getShiftAmountConstant(HBitWidth - TrailingZeros,
- HiLoVT, dl)));
- ShiftedLH =
- DAG.getNode(ISD::SRL, dl, HiLoVT, ShiftedLH,
- DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
- }
-
// Use addcarry if we can, otherwise use a compare to detect overflow.
EVT SetCCType =
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT);
if (isOperationLegalOrCustom(ISD::ADDCARRY, HiLoVT)) {
SDVTList VTList = DAG.getVTList(HiLoVT, SetCCType);
- Sum = DAG.getNode(ISD::UADDO, dl, VTList, ShiftedLL, ShiftedLH);
+ Sum = DAG.getNode(ISD::UADDO, dl, VTList, LL, LH);
Sum = DAG.getNode(ISD::ADDCARRY, dl, VTList, Sum,
DAG.getConstant(0, dl, HiLoVT), Sum.getValue(1));
} else {
- Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, ShiftedLL, ShiftedLH);
- SDValue Carry = DAG.getSetCC(dl, SetCCType, Sum, ShiftedLL, ISD::SETULT);
+ Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, LL, LH);
+ SDValue Carry = DAG.getSetCC(dl, SetCCType, Sum, LL, ISD::SETULT);
// If the boolean for the target is 0 or 1, we can add the setcc result
// directly.
if (getBooleanContents(HiLoVT) ==
@@ -7295,17 +7263,6 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
// High half of the remainder is 0.
SDValue RemH = DAG.getConstant(0, dl, HiLoVT);
- // If we shifted the input, shift the remainder left and add the bits we
- // shifted off the input.
- if (TrailingZeros) {
- APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
- RemL = DAG.getNode(ISD::SHL, dl, HiLoVT, RemL,
- DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
- RemL = DAG.getNode(ISD::ADD, dl, HiLoVT, RemL,
- DAG.getNode(ISD::AND, dl, HiLoVT, LL,
- DAG.getConstant(Mask, dl, HiLoVT)));
- }
-
// If we only want remainder, we're done.
if (Opcode == ISD::UREM) {
Result.push_back(RemL);
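For reference, the part of the expansion this revert keeps works as follows: split the wide dividend into two halves, fold the high half into the low half with an end-around carry, and take one narrow remainder. A minimal standalone sketch of the 64-into-32 case (plain C++ with illustrative names, not the DAG code; it assumes a divisor D with 2^32 mod D == 1, i.e. D divides 2^32 - 1, which holds for the divisors the tests keep: 3, 5, 257 and 65537):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Remainder of a 64-bit value by a constant D with 2^32 % D == 1, using
// only 32-bit adds plus one narrow remainder ("Remainder by Summing
// Digits", Hacker's Delight).
static uint32_t RemBySummingDigits(uint64_t N, uint32_t D) {
  uint32_t Lo = static_cast<uint32_t>(N);
  uint32_t Hi = static_cast<uint32_t>(N >> 32);
  // N = Hi * 2^32 + Lo == Hi + Lo (mod D), since 2^32 == 1 (mod D).
  uint32_t Sum = Lo + Hi;
  // If the add wrapped, it dropped 2^32, which is 1 (mod D); add it back.
  // This mirrors the UADDO + ADDCARRY pair (or the SETULT compare on
  // targets without legal ADDCARRY) in expandDIVREMByConstant.
  Sum += (Sum < Lo) ? 1 : 0;
  // The narrow urem is then expanded further, into the multiply-by-magic-
  // constant sequences visible in the test diffs below.
  return Sum % D;
}

int main() {
  for (uint64_t N : {0ULL, 11ULL, 0xDEADBEEFCAFEF00DULL, ~0ULL})
    for (uint32_t D : {3u, 5u, 257u, 65537u})
      assert(RemBySummingDigits(N, D) == N % D);
  puts("ok");
}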
diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
index fd75ebc943274..dbfea8a03212a 100644
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -502,65 +502,24 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind {
define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
; RV32-LABEL: test_udiv_12:
; RV32: # %bb.0:
-; RV32-NEXT: slli a2, a1, 30
-; RV32-NEXT: srli a3, a0, 2
-; RV32-NEXT: or a2, a3, a2
-; RV32-NEXT: srli a3, a1, 2
-; RV32-NEXT: add a3, a2, a3
-; RV32-NEXT: sltu a2, a3, a2
-; RV32-NEXT: add a2, a3, a2
-; RV32-NEXT: lui a3, 699051
-; RV32-NEXT: addi a4, a3, -1365
-; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 1
-; RV32-NEXT: andi a5, a5, -2
-; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: sub a2, a2, a5
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: andi a5, a0, 3
-; RV32-NEXT: or a2, a2, a5
-; RV32-NEXT: sub a5, a0, a2
-; RV32-NEXT: addi a3, a3, -1366
-; RV32-NEXT: mul a3, a5, a3
-; RV32-NEXT: mulhu a6, a5, a4
-; RV32-NEXT: add a3, a6, a3
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: sub a0, a1, a0
-; RV32-NEXT: mul a0, a0, a4
-; RV32-NEXT: add a1, a3, a0
-; RV32-NEXT: mul a0, a5, a4
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: li a2, 12
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __udivdi3@plt
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_12:
; RV64: # %bb.0:
-; RV64-NEXT: slli a2, a1, 62
-; RV64-NEXT: srli a3, a0, 2
-; RV64-NEXT: or a2, a3, a2
-; RV64-NEXT: srli a3, a1, 2
-; RV64-NEXT: lui a4, %hi(.LCPI10_0)
-; RV64-NEXT: ld a4, %lo(.LCPI10_0)(a4)
-; RV64-NEXT: add a3, a2, a3
-; RV64-NEXT: sltu a2, a3, a2
-; RV64-NEXT: add a2, a3, a2
-; RV64-NEXT: mulhu a3, a2, a4
-; RV64-NEXT: srli a5, a3, 1
-; RV64-NEXT: andi a3, a3, -2
-; RV64-NEXT: add a3, a3, a5
-; RV64-NEXT: sub a2, a2, a3
-; RV64-NEXT: slli a2, a2, 2
-; RV64-NEXT: lui a3, %hi(.LCPI10_1)
-; RV64-NEXT: ld a3, %lo(.LCPI10_1)(a3)
-; RV64-NEXT: andi a5, a0, 3
-; RV64-NEXT: or a2, a2, a5
-; RV64-NEXT: sub a5, a0, a2
-; RV64-NEXT: mul a3, a5, a3
-; RV64-NEXT: mulhu a6, a5, a4
-; RV64-NEXT: add a3, a6, a3
-; RV64-NEXT: sltu a0, a0, a2
-; RV64-NEXT: sub a0, a1, a0
-; RV64-NEXT: mul a0, a0, a4
-; RV64-NEXT: add a1, a3, a0
-; RV64-NEXT: mul a0, a5, a4
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: li a2, 12
+; RV64-NEXT: li a3, 0
+; RV64-NEXT: call __udivti3@plt
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 12
ret iXLen2 %a
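The division half of the updated comment follows from the remainder: N - (N % D) is an exact multiple of D, and dividing an exact multiple only takes a multiply by D's multiplicative inverse mod 2^64. Such an inverse exists only for odd D, which is why the reverted patch had to shift even divisors like 12 down to an odd value first, and why the even cases above now fall back to the __udivdi3/__udivti3 libcalls. A hedged standalone sketch (plain C++; the helper names are illustrative, not LLVM APIs):

#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t InverseMod2Pow64(uint64_t D) {
  assert((D & 1) && "only odd values are invertible mod 2^64");
  uint64_t Inv = D; // correct to 3 bits: D * D == 1 (mod 8) for odd D
  for (int I = 0; I < 5; ++I)
    Inv *= 2 - D * Inv; // each Newton step doubles the correct low bits
  return Inv;
}

static uint64_t UDivByOddConstant(uint64_t N, uint64_t D) {
  // N - N % D is an exact multiple of D; multiplying an exact multiple
  // by D's inverse mod 2^64 recovers the quotient exactly.
  uint64_t Rem = N % D; // stands in for the summing-digits remainder
  return (N - Rem) * InverseMod2Pow64(D);
}

int main() {
  for (uint64_t N : {0ULL, 12ULL, 0x123456789ABCDEFULL, ~0ULL})
    for (uint64_t D : {3ULL, 5ULL, 257ULL, 65537ULL})
      assert(UDivByOddConstant(N, D) == N / D);
  puts("ok");
}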
diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
index 7a52de8f066e7..c0e210a7ec46e 100644
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -335,46 +335,24 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind {
define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
; RV32-LABEL: test_urem_12:
; RV32: # %bb.0:
-; RV32-NEXT: slli a2, a1, 30
-; RV32-NEXT: srli a3, a0, 2
-; RV32-NEXT: or a2, a3, a2
-; RV32-NEXT: srli a1, a1, 2
-; RV32-NEXT: add a1, a2, a1
-; RV32-NEXT: sltu a2, a1, a2
-; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: lui a2, 699051
-; RV32-NEXT: addi a2, a2, -1365
-; RV32-NEXT: mulhu a2, a1, a2
-; RV32-NEXT: srli a3, a2, 1
-; RV32-NEXT: andi a2, a2, -2
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: sub a1, a1, a2
-; RV32-NEXT: slli a1, a1, 2
-; RV32-NEXT: andi a0, a0, 3
-; RV32-NEXT: or a0, a1, a0
-; RV32-NEXT: li a1, 0
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: li a2, 12
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __umoddi3@plt
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: test_urem_12:
; RV64: # %bb.0:
-; RV64-NEXT: slli a2, a1, 62
-; RV64-NEXT: srli a3, a0, 2
-; RV64-NEXT: or a2, a3, a2
-; RV64-NEXT: srli a1, a1, 2
-; RV64-NEXT: lui a3, %hi(.LCPI10_0)
-; RV64-NEXT: ld a3, %lo(.LCPI10_0)(a3)
-; RV64-NEXT: add a1, a2, a1
-; RV64-NEXT: sltu a2, a1, a2
-; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: mulhu a2, a1, a3
-; RV64-NEXT: srli a3, a2, 1
-; RV64-NEXT: andi a2, a2, -2
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: sub a1, a1, a2
-; RV64-NEXT: slli a1, a1, 2
-; RV64-NEXT: andi a0, a0, 3
-; RV64-NEXT: or a0, a1, a0
-; RV64-NEXT: li a1, 0
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: li a2, 12
+; RV64-NEXT: li a3, 0
+; RV64-NEXT: call __umodti3@plt
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%a = urem iXLen2 %x, 12
ret iXLen2 %a
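For reference, the remainder fixup this revert removes: with an even divisor D = Odd << K, the low K bits of the dividend survive unchanged into the remainder, so only the shifted-down value needs the odd-divisor expansion. A standalone sketch (plain C++, using the GCC/Clang __builtin_ctzll builtin; illustrative names, not the DAG code):

#include <cassert>
#include <cstdint>
#include <cstdio>

// With D == Odd << K, the low K bits of N pass straight through, so
//   N % D == (((N >> K) % Odd) << K) | (N & ((1 << K) - 1)).
// For D == 12 this is K == 2, Odd == 3, matching the slli/srli/andi 3
// sequences removed from the tests above.
static uint64_t URemByEvenConstant(uint64_t N, uint64_t D) {
  unsigned K = __builtin_ctzll(D);  // trailing zeros of the divisor
  uint64_t Odd = D >> K;
  uint64_t RemOdd = (N >> K) % Odd; // stands in for the odd-divisor expansion
  return (RemOdd << K) | (N & ((1ULL << K) - 1));
}

int main() {
  for (uint64_t N : {0ULL, 11ULL, 0xFEEDFACE12345678ULL, ~0ULL})
    assert(URemByEvenConstant(N, 12) == N % 12);
  puts("ok");
}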
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index 2f8a19b36b623..8f6d3ddb60e41 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -735,24 +735,13 @@ entry:
define i64 @urem_i64_12(i64 %x) nounwind {
; X32-LABEL: urem_i64_12:
; X32: # %bb.0: # %entry
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $2, %eax
-; X32-NEXT: shldl $30, %esi, %ecx
-; X32-NEXT: addl %eax, %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %edx
-; X32-NEXT: shrl %edx
-; X32-NEXT: leal (%edx,%edx,2), %eax
-; X32-NEXT: subl %eax, %ecx
-; X32-NEXT: andl $3, %esi
-; X32-NEXT: leal (%esi,%ecx,4), %eax
-; X32-NEXT: xorl %edx, %edx
-; X32-NEXT: popl %esi
+; X32-NEXT: subl $12, %esp
+; X32-NEXT: pushl $0
+; X32-NEXT: pushl $12
+; X32-NEXT: pushl {{[0-9]+}}(%esp)
+; X32-NEXT: pushl {{[0-9]+}}(%esp)
+; X32-NEXT: calll __umoddi3
+; X32-NEXT: addl $28, %esp
; X32-NEXT: retl
;
; X64-LABEL: urem_i64_12:
@@ -1127,37 +1116,13 @@ entry:
define i64 @udiv_i64_12(i64 %x) nounwind {
; X32-LABEL: udiv_i64_12:
; X32: # %bb.0: # %entry
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: shrl $2, %eax
-; X32-NEXT: movl %edi, %esi
-; X32-NEXT: shldl $30, %ecx, %esi
-; X32-NEXT: addl %eax, %esi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ebx
-; X32-NEXT: shrl %edx
-; X32-NEXT: leal (%edx,%edx,2), %eax
-; X32-NEXT: subl %eax, %esi
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: andl $3, %eax
-; X32-NEXT: leal (%eax,%esi,4), %eax
-; X32-NEXT: subl %eax, %ecx
-; X32-NEXT: sbbl $0, %edi
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %ebx
-; X32-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
-; X32-NEXT: addl %ecx, %edx
-; X32-NEXT: imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
-; X32-NEXT: addl %ecx, %edx
-; X32-NEXT: popl %esi
-; X32-NEXT: popl %edi
-; X32-NEXT: popl %ebx
+; X32-NEXT: subl $12, %esp
+; X32-NEXT: pushl $0
+; X32-NEXT: pushl $12
+; X32-NEXT: pushl {{[0-9]+}}(%esp)
+; X32-NEXT: pushl {{[0-9]+}}(%esp)
+; X32-NEXT: calll __udivdi3
+; X32-NEXT: addl $28, %esp
; X32-NEXT: retl
;
; X64-LABEL: udiv_i64_12:
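A note on the 0xAAAAAAAB multiplier in the removed sequences (and its 64-bit sibling 0xAAAAAAAAAAAAAAAB): it is the standard magic constant for unsigned division by 3, the odd part of 12, so no hardware divide is needed. A quick standalone check (plain C++):

#include <cassert>
#include <cstdint>
#include <cstdio>

// 0xAAAAAAAB == (2^33 + 1) / 3, and for any 32-bit n,
//   n / 3 == (n * 0xAAAAAAAB) >> 33,
// which is the mull/mulhu followed by a 1-bit shift in the old output.
static uint32_t UDiv3(uint32_t N) {
  return static_cast<uint32_t>((N * 0xAAAAAAABULL) >> 33);
}

int main() {
  for (uint64_t N = 0; N <= UINT32_MAX; N += 9973) // sparse sweep
    assert(UDiv3(static_cast<uint32_t>(N)) == N / 3);
  assert(UDiv3(UINT32_MAX) == UINT32_MAX / 3);
  puts("ok");
}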
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index cb234263384a1..4549598ca3ed9 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -425,39 +425,27 @@ entry:
define i128 @urem_i128_12(i128 %x) nounwind {
; X86-64-LABEL: urem_i128_12:
; X86-64: # %bb.0: # %entry
-; X86-64-NEXT: movq %rsi, %rax
-; X86-64-NEXT: shldq $62, %rdi, %rax
-; X86-64-NEXT: shrq $2, %rsi
-; X86-64-NEXT: addq %rax, %rsi
-; X86-64-NEXT: adcq $0, %rsi
-; X86-64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
-; X86-64-NEXT: movq %rsi, %rax
-; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: shrq %rdx
-; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax
-; X86-64-NEXT: subq %rax, %rsi
-; X86-64-NEXT: andl $3, %edi
-; X86-64-NEXT: leaq (%rdi,%rsi,4), %rax
-; X86-64-NEXT: xorl %edx, %edx
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movl $12, %edx
+; X86-64-NEXT: xorl %ecx, %ecx
+; X86-64-NEXT: callq __umodti3@PLT
+; X86-64-NEXT: popq %rcx
; X86-64-NEXT: retq
;
; WIN64-LABEL: urem_i128_12:
; WIN64: # %bb.0: # %entry
-; WIN64-NEXT: movq %rdx, %r8
-; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: shldq $62, %rcx, %rax
-; WIN64-NEXT: shrq $2, %r8
-; WIN64-NEXT: addq %rax, %r8
-; WIN64-NEXT: adcq $0, %r8
-; WIN64-NEXT: movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB
-; WIN64-NEXT: movq %r8, %rax
-; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq %rdx
-; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax
-; WIN64-NEXT: subq %rax, %r8
-; WIN64-NEXT: andl $3, %ecx
-; WIN64-NEXT: leaq (%rcx,%r8,4), %rax
-; WIN64-NEXT: xorl %edx, %edx
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $12, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __umodti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
; WIN64-NEXT: retq
entry:
%rem = urem i128 %x, 12
@@ -899,59 +887,27 @@ entry:
define i128 @udiv_i128_12(i128 %x) nounwind {
; X86-64-LABEL: udiv_i128_12:
; X86-64: # %bb.0: # %entry
-; X86-64-NEXT: movq %rsi, %rax
-; X86-64-NEXT: shldq $62, %rdi, %rax
-; X86-64-NEXT: movq %rsi, %rcx
-; X86-64-NEXT: shrq $2, %rcx
-; X86-64-NEXT: addq %rax, %rcx
-; X86-64-NEXT: adcq $0, %rcx
-; X86-64-NEXT: movabsq $-6148914691236517205, %r8 # imm = 0xAAAAAAAAAAAAAAAB
-; X86-64-NEXT: movq %rcx, %rax
-; X86-64-NEXT: mulq %r8
-; X86-64-NEXT: shrq %rdx
-; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax
-; X86-64-NEXT: subq %rax, %rcx
-; X86-64-NEXT: movl %edi, %eax
-; X86-64-NEXT: andl $3, %eax
-; X86-64-NEXT: leaq (%rax,%rcx,4), %rax
-; X86-64-NEXT: subq %rax, %rdi
-; X86-64-NEXT: sbbq $0, %rsi
-; X86-64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA
-; X86-64-NEXT: imulq %rdi, %rcx
-; X86-64-NEXT: movq %rdi, %rax
-; X86-64-NEXT: mulq %r8
-; X86-64-NEXT: addq %rcx, %rdx
-; X86-64-NEXT: imulq %rsi, %r8
-; X86-64-NEXT: addq %r8, %rdx
+; X86-64-NEXT: pushq %rax
+; X86-64-NEXT: movl $12, %edx
+; X86-64-NEXT: xorl %ecx, %ecx
+; X86-64-NEXT: callq __udivti3@PLT
+; X86-64-NEXT: popq %rcx
; X86-64-NEXT: retq
;
; WIN64-LABEL: udiv_i128_12:
; WIN64: # %bb.0: # %entry
-; WIN64-NEXT: movq %rdx, %r8
-; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: shldq $62, %rcx, %rax
-; WIN64-NEXT: movq %rdx, %r9
-; WIN64-NEXT: shrq $2, %r9
-; WIN64-NEXT: addq %rax, %r9
-; WIN64-NEXT: adcq $0, %r9
-; WIN64-NEXT: movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB
-; WIN64-NEXT: movq %r9, %rax
-; WIN64-NEXT: mulq %r10
-; WIN64-NEXT: shrq %rdx
-; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax
-; WIN64-NEXT: subq %rax, %r9
-; WIN64-NEXT: movl %ecx, %eax
-; WIN64-NEXT: andl $3, %eax
-; WIN64-NEXT: leaq (%rax,%r9,4), %rax
-; WIN64-NEXT: subq %rax, %rcx
-; WIN64-NEXT: sbbq $0, %r8
-; WIN64-NEXT: movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA
-; WIN64-NEXT: imulq %rcx, %r9
-; WIN64-NEXT: movq %rcx, %rax
-; WIN64-NEXT: mulq %r10
-; WIN64-NEXT: addq %r9, %rdx
-; WIN64-NEXT: imulq %r10, %r8
-; WIN64-NEXT: addq %r8, %rdx
+; WIN64-NEXT: subq $72, %rsp
+; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $12, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT: callq __udivti3
+; WIN64-NEXT: movq %xmm0, %rax
+; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT: movq %xmm0, %rdx
+; WIN64-NEXT: addq $72, %rsp
; WIN64-NEXT: retq
entry:
%rem = udiv i128 %x, 12