[llvm] 65aaecc - Revert "[TargetLowering][RISCV][X86] Support even divisors in expandDIVREMByConstant."

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 24 07:13:18 PDT 2022


Author: Craig Topper
Date: 2022-10-24T07:12:54-07:00
New Revision: 65aaecca8842dec30d03734a7fe8ce33c5afec81

URL: https://github.com/llvm/llvm-project/commit/65aaecca8842dec30d03734a7fe8ce33c5afec81
DIFF: https://github.com/llvm/llvm-project/commit/65aaecca8842dec30d03734a7fe8ce33c5afec81.diff

LOG: Revert "[TargetLowering][RISCV][X86] Support even divisors in expandDIVREMByConstant."

This reverts commit f6a7b47820904c5e69cc4f133d382c74a87c44e8.

I received a report that this fails on 32-bit X86.
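
For context, the reverted change handled an even divisor D = d << k (with d
odd) by shifting the dividend right by k, running the existing odd-divisor
expansion on the shifted value, and then rebuilding the remainder from the
shifted-off low bits. A minimal standalone sketch of that identity on plain
64-bit integers (illustration only, not the SelectionDAG code; the helper
name is made up):

#include <cassert>
#include <cstdint>

// Divide/modulo by an even constant D = d << K with d odd, using only a
// division by the odd part d.
static void udivrem_even(uint64_t X, uint64_t D, uint64_t &Quot,
                         uint64_t &Rem) {
  unsigned K = __builtin_ctzll(D); // number of trailing zeros in D
  uint64_t OddD = D >> K;          // odd part of the divisor
  uint64_t ShiftedX = X >> K;      // drop the low K bits of the dividend
  Quot = ShiftedX / OddD;          // X / (d << K) == (X >> K) / d
  // Remainder by the odd part, shifted back up, plus the low K bits that
  // were shifted off the dividend.
  Rem = ((ShiftedX % OddD) << K) | (X & ((uint64_t(1) << K) - 1));
}

int main() {
  uint64_t Q, R;
  udivrem_even(1000003, 12, Q, R); // 12 == 3 << 2
  assert(Q == 1000003 / 12 && R == 1000003 % 12);
  return 0;
}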

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
    llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
    llvm/test/CodeGen/X86/divide-by-constant.ll
    llvm/test/CodeGen/X86/divmod128.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 33631f290e284..2aa8c04f43251 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7168,17 +7168,8 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
 //   Remainder = Sum % Constant
 // This is based on "Remainder by Summing Digits" from Hacker's Delight.
 //
-// For division, we can compute the remainder using the algorithm described
-// above, subtract it from the dividend to get an exact multiple of Constant.
-// Then multiply that exact multiple by the multiplicative inverse modulo
-// (1 << (BitWidth / 2)) to get the quotient.
-
-// If Constant is even, we can shift right the dividend and the divisor by the
-// number of trailing zeros in Constant before applying the remainder algorithm.
-// If we're after the quotient, we can subtract this value from the shifted
-// dividend and multiply by the multiplicative inverse of the shifted divisor.
-// If we want the remainder, we shift the value left by the number of trailing
-// zeros and add the bits that were shifted out of the dividend.
+// For division, we can compute the remainder, subtract it from the dividend,
+// and then multiply by the multiplicative inverse modulo (1 << (BitWidth / 2)).
 bool TargetLowering::expandDIVREMByConstant(SDNode *N,
                                             SmallVectorImpl<SDValue> &Result,
                                             EVT HiLoVT, SelectionDAG &DAG,
@@ -7197,7 +7188,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
   if (!CN)
     return false;
 
-  APInt Divisor = CN->getAPIntValue();
+  const APInt &Divisor = CN->getAPIntValue();
   unsigned BitWidth = Divisor.getBitWidth();
   unsigned HBitWidth = BitWidth / 2;
   assert(VT.getScalarSizeInBits() == BitWidth &&
@@ -7218,20 +7209,12 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
   if (DAG.shouldOptForSize())
     return false;
 
-  // Early out for 0 or 1 divisors.
-  if (Divisor.ule(1))
+  // Early out for 0, 1 or even divisors.
+  if (Divisor.ule(1) || Divisor[0] == 0)
     return false;
 
-  // If the divisor is even, shift it until it becomes odd.
-  unsigned TrailingZeros = 0;
-  if (!Divisor[0]) {
-    TrailingZeros = Divisor.countTrailingZeros();
-    Divisor.lshrInPlace(TrailingZeros);
-  }
-
   SDLoc dl(N);
   SDValue Sum;
-  SDValue PartialRem;
 
   // If (1 << HBitWidth) % divisor == 1, we can add the two halves together and
   // then add in the carry.
@@ -7246,27 +7229,6 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
                        DAG.getIntPtrConstant(1, dl));
     }
 
-    // Shift the input by the number of TrailingZeros in the divisor. The
-    // shifted out bits will be added to the remainder later.
-    if (TrailingZeros) {
-      LL = DAG.getNode(
-          ISD::OR, dl, HiLoVT,
-          DAG.getNode(ISD::SRL, dl, HiLoVT, LL,
-                      DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl)),
-          DAG.getNode(ISD::SHL, dl, HiLoVT, LH,
-                      DAG.getShiftAmountConstant(HBitWidth - TrailingZeros,
-                                                 HiLoVT, dl)));
-      LH = DAG.getNode(ISD::SRL, dl, HiLoVT, LH,
-                       DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
-
-      // Save the shifted off bits if we need the remainder.
-      if (Opcode != ISD::UDIV) {
-        APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
-        PartialRem = DAG.getNode(ISD::AND, dl, HiLoVT, LL,
-                                 DAG.getConstant(Mask, dl, HiLoVT));
-      }
-    }
-
     // Use addcarry if we can, otherwise use a compare to detect overflow.
     EVT SetCCType =
         getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT);
@@ -7298,45 +7260,45 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
   SDValue RemL =
       DAG.getNode(ISD::UREM, dl, HiLoVT, Sum,
                   DAG.getConstant(Divisor.trunc(HBitWidth), dl, HiLoVT));
+  // High half of the remainder is 0.
   SDValue RemH = DAG.getConstant(0, dl, HiLoVT);
 
-  if (Opcode != ISD::UREM) {
-    // Subtract the remainder from the shifted dividend.
-    SDValue Dividend = DAG.getNode(ISD::BUILD_PAIR, dl, VT, LL, LH);
-    SDValue Rem = DAG.getNode(ISD::BUILD_PAIR, dl, VT, RemL, RemH);
-
-    Dividend = DAG.getNode(ISD::SUB, dl, VT, Dividend, Rem);
-
-    // Multiply by the multiplicative inverse of the divisor modulo
-    // (1 << BitWidth).
-    APInt Mod = APInt::getSignedMinValue(BitWidth + 1);
-    APInt MulFactor = Divisor.zext(BitWidth + 1);
-    MulFactor = MulFactor.multiplicativeInverse(Mod);
-    MulFactor = MulFactor.trunc(BitWidth);
-
-    SDValue Quotient = DAG.getNode(ISD::MUL, dl, VT, Dividend,
-                                   DAG.getConstant(MulFactor, dl, VT));
-
-    // Split the quotient into low and high parts.
-    SDValue QuotL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
-                                DAG.getIntPtrConstant(0, dl));
-    SDValue QuotH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
-                                DAG.getIntPtrConstant(1, dl));
-    Result.push_back(QuotL);
-    Result.push_back(QuotH);
-  }
-
-  if (Opcode != ISD::UDIV) {
-    // If we shifted the input, shift the remainder left and add the bits we
-    // shifted off the input.
-    if (TrailingZeros) {
-      APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
-      RemL = DAG.getNode(ISD::SHL, dl, HiLoVT, RemL,
-                         DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
-      RemL = DAG.getNode(ISD::ADD, dl, HiLoVT, RemL, PartialRem);
-    }
+  // If we only want remainder, we're done.
+  if (Opcode == ISD::UREM) {
+    Result.push_back(RemL);
+    Result.push_back(RemH);
+    return true;
+  }
+
+  // Otherwise, we need to compute the quotient.
+
+  // Join the remainder halves.
+  SDValue Rem = DAG.getNode(ISD::BUILD_PAIR, dl, VT, RemL, RemH);
+
+  // Subtract the remainder from the input.
+  SDValue In = DAG.getNode(ISD::SUB, dl, VT, N->getOperand(0), Rem);
+
+  // Multiply by the multiplicative inverse of the divisor modulo
+  // (1 << BitWidth).
+  APInt Mod = APInt::getSignedMinValue(BitWidth + 1);
+  APInt MulFactor = Divisor.zext(BitWidth + 1);
+  MulFactor = MulFactor.multiplicativeInverse(Mod);
+  MulFactor = MulFactor.trunc(BitWidth);
+
+  SDValue Quotient =
+      DAG.getNode(ISD::MUL, dl, VT, In, DAG.getConstant(MulFactor, dl, VT));
+
+  // Split the quotient into low and high parts.
+  SDValue QuotL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
+                              DAG.getIntPtrConstant(0, dl));
+  SDValue QuotH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
+                              DAG.getIntPtrConstant(1, dl));
+  Result.push_back(QuotL);
+  Result.push_back(QuotH);
+  // For DIVREM, also return the remainder parts.
+  if (Opcode == ISD::UDIVREM) {
     Result.push_back(RemL);
-    Result.push_back(DAG.getConstant(0, dl, HiLoVT));
+    Result.push_back(RemH);
   }
 
   return true;
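
For odd divisors the expansion kept above still applies: add the two halves
(plus the carry), take the remainder of that sum with a half-width urem, and,
if the quotient is needed, subtract the remainder and multiply by the
multiplicative inverse of the divisor modulo 2^BitWidth. Spelled out on plain
integers for a 64-bit dividend, 32-bit halves and the divisor 3 (2^32 % 3 == 1,
so the remainder of the digit sum equals the remainder of the full value), a
rough sketch looks like this (illustration only, not LLVM code; the function
name is invented):

#include <cassert>
#include <cstdint>

static void udivrem3(uint64_t X, uint64_t &Quot, uint64_t &Rem) {
  uint32_t Lo = uint32_t(X), Hi = uint32_t(X >> 32);

  // Sum the halves and fold the carry back in (the addcarry step).
  uint32_t Sum = Lo + Hi;
  Sum += (Sum < Lo); // 1 if the 32-bit add overflowed

  // Because 2^32 == 1 (mod 3), X and Sum have the same remainder mod 3,
  // so a 32-bit urem is enough.
  Rem = Sum % 3;

  // Quotient: X - Rem is an exact multiple of 3; multiply by the
  // multiplicative inverse of 3 modulo 2^64.
  const uint64_t Inv3 = 0xAAAAAAAAAAAAAAABull;
  Quot = (X - Rem) * Inv3;
}

int main() {
  uint64_t Q, R;
  udivrem3(0x123456789ABCDEF0ull, Q, R);
  assert(Q == 0x123456789ABCDEF0ull / 3 && R == 0x123456789ABCDEF0ull % 3);
  return 0;
}

The constant 0xAAAAAAAAAAAAAAAB is the same multiplicative inverse that appears
as -6148914691236517205 in the removed X86 assembly further down.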

diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
index cd2b7aa9f351d..dbfea8a03212a 100644
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -502,59 +502,24 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind {
 define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_udiv_12:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a2, a1, 30
-; RV32-NEXT:    srli a0, a0, 2
-; RV32-NEXT:    or a0, a0, a2
-; RV32-NEXT:    srli a1, a1, 2
-; RV32-NEXT:    add a2, a0, a1
-; RV32-NEXT:    sltu a3, a2, a0
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    lui a3, 699051
-; RV32-NEXT:    addi a4, a3, -1365
-; RV32-NEXT:    mulhu a5, a2, a4
-; RV32-NEXT:    srli a6, a5, 1
-; RV32-NEXT:    andi a5, a5, -2
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    sub a2, a2, a5
-; RV32-NEXT:    sub a5, a0, a2
-; RV32-NEXT:    addi a3, a3, -1366
-; RV32-NEXT:    mul a3, a5, a3
-; RV32-NEXT:    mulhu a6, a5, a4
-; RV32-NEXT:    add a3, a6, a3
-; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    sub a0, a1, a0
-; RV32-NEXT:    mul a0, a0, a4
-; RV32-NEXT:    add a1, a3, a0
-; RV32-NEXT:    mul a0, a5, a4
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    li a2, 12
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __udivdi3@plt
+; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_udiv_12:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a2, a1, 62
-; RV64-NEXT:    srli a0, a0, 2
-; RV64-NEXT:    or a0, a0, a2
-; RV64-NEXT:    srli a1, a1, 2
-; RV64-NEXT:    lui a2, %hi(.LCPI10_0)
-; RV64-NEXT:    ld a2, %lo(.LCPI10_0)(a2)
-; RV64-NEXT:    add a3, a0, a1
-; RV64-NEXT:    sltu a4, a3, a0
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    mulhu a4, a3, a2
-; RV64-NEXT:    srli a5, a4, 1
-; RV64-NEXT:    andi a4, a4, -2
-; RV64-NEXT:    lui a6, %hi(.LCPI10_1)
-; RV64-NEXT:    ld a6, %lo(.LCPI10_1)(a6)
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    sub a3, a3, a4
-; RV64-NEXT:    sub a4, a0, a3
-; RV64-NEXT:    mul a5, a4, a6
-; RV64-NEXT:    mulhu a6, a4, a2
-; RV64-NEXT:    add a5, a6, a5
-; RV64-NEXT:    sltu a0, a0, a3
-; RV64-NEXT:    sub a0, a1, a0
-; RV64-NEXT:    mul a0, a0, a2
-; RV64-NEXT:    add a1, a5, a0
-; RV64-NEXT:    mul a0, a4, a2
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    li a2, 12
+; RV64-NEXT:    li a3, 0
+; RV64-NEXT:    call __udivti3@plt
+; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 12
   ret iXLen2 %a

diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
index aa235c48c56a9..c0e210a7ec46e 100644
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -335,46 +335,24 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind {
 define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_urem_12:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a2, a1, 30
-; RV32-NEXT:    srli a0, a0, 2
-; RV32-NEXT:    or a0, a0, a2
-; RV32-NEXT:    srli a1, a1, 2
-; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    sltu a2, a1, a0
-; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    lui a2, 699051
-; RV32-NEXT:    addi a2, a2, -1365
-; RV32-NEXT:    mulhu a2, a1, a2
-; RV32-NEXT:    srli a3, a2, 1
-; RV32-NEXT:    andi a2, a2, -2
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    sub a1, a1, a2
-; RV32-NEXT:    slli a1, a1, 2
-; RV32-NEXT:    andi a0, a0, 3
-; RV32-NEXT:    or a0, a1, a0
-; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    li a2, 12
+; RV32-NEXT:    li a3, 0
+; RV32-NEXT:    call __umoddi3@plt
+; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_urem_12:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a2, a1, 62
-; RV64-NEXT:    srli a0, a0, 2
-; RV64-NEXT:    or a0, a0, a2
-; RV64-NEXT:    srli a1, a1, 2
-; RV64-NEXT:    lui a2, %hi(.LCPI10_0)
-; RV64-NEXT:    ld a2, %lo(.LCPI10_0)(a2)
-; RV64-NEXT:    add a1, a0, a1
-; RV64-NEXT:    sltu a3, a1, a0
-; RV64-NEXT:    add a1, a1, a3
-; RV64-NEXT:    mulhu a2, a1, a2
-; RV64-NEXT:    srli a3, a2, 1
-; RV64-NEXT:    andi a2, a2, -2
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    sub a1, a1, a2
-; RV64-NEXT:    slli a1, a1, 2
-; RV64-NEXT:    andi a0, a0, 3
-; RV64-NEXT:    or a0, a1, a0
-; RV64-NEXT:    li a1, 0
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    li a2, 12
+; RV64-NEXT:    li a3, 0
+; RV64-NEXT:    call __umodti3@plt
+; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
   %a = urem iXLen2 %x, 12
   ret iXLen2 %a

diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index 9197dc0c1d64e..8f6d3ddb60e41 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -735,23 +735,13 @@ entry:
 define i64 @urem_i64_12(i64 %x) nounwind {
 ; X32-LABEL: urem_i64_12:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    shrdl $2, %ecx, %esi
-; X32-NEXT:    shrl $2, %ecx
-; X32-NEXT:    addl %esi, %ecx
-; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    movl $-1431655765, %edx # imm = 0xAAAAAAAB
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %edx
-; X32-NEXT:    shrl %edx
-; X32-NEXT:    leal (%edx,%edx,2), %eax
-; X32-NEXT:    subl %eax, %ecx
-; X32-NEXT:    andl $3, %esi
-; X32-NEXT:    leal (%esi,%ecx,4), %eax
-; X32-NEXT:    xorl %edx, %edx
-; X32-NEXT:    popl %esi
+; X32-NEXT:    subl $12, %esp
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $12
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    calll __umoddi3
+; X32-NEXT:    addl $28, %esp
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: urem_i64_12:
@@ -1126,33 +1116,13 @@ entry:
 define i64 @udiv_i64_12(i64 %x) nounwind {
 ; X32-LABEL: udiv_i64_12:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    pushl %ebx
-; X32-NEXT:    pushl %edi
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    shrdl $2, %edi, %ecx
-; X32-NEXT:    shrl $2, %edi
-; X32-NEXT:    movl %ecx, %esi
-; X32-NEXT:    addl %edi, %esi
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl $-1431655765, %ebx # imm = 0xAAAAAAAB
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    shrl %edx
-; X32-NEXT:    leal (%edx,%edx,2), %eax
-; X32-NEXT:    subl %eax, %esi
-; X32-NEXT:    subl %esi, %ecx
-; X32-NEXT:    sbbl $0, %edi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    popl %esi
-; X32-NEXT:    popl %edi
-; X32-NEXT:    popl %ebx
+; X32-NEXT:    subl $12, %esp
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $12
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    calll __udivdi3
+; X32-NEXT:    addl $28, %esp
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: udiv_i64_12:

diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index b0657f4f8456d..4549598ca3ed9 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -425,37 +425,27 @@ entry:
 define i128 @urem_i128_12(i128 %x) nounwind {
 ; X86-64-LABEL: urem_i128_12:
 ; X86-64:       # %bb.0: # %entry
-; X86-64-NEXT:    shrdq $2, %rsi, %rdi
-; X86-64-NEXT:    shrq $2, %rsi
-; X86-64-NEXT:    addq %rdi, %rsi
-; X86-64-NEXT:    adcq $0, %rsi
-; X86-64-NEXT:    movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
-; X86-64-NEXT:    movq %rsi, %rax
-; X86-64-NEXT:    mulq %rcx
-; X86-64-NEXT:    shrq %rdx
-; X86-64-NEXT:    leaq (%rdx,%rdx,2), %rax
-; X86-64-NEXT:    subq %rax, %rsi
-; X86-64-NEXT:    andl $3, %edi
-; X86-64-NEXT:    leaq (%rdi,%rsi,4), %rax
-; X86-64-NEXT:    xorl %edx, %edx
+; X86-64-NEXT:    pushq %rax
+; X86-64-NEXT:    movl $12, %edx
+; X86-64-NEXT:    xorl %ecx, %ecx
+; X86-64-NEXT:    callq __umodti3@PLT
+; X86-64-NEXT:    popq %rcx
 ; X86-64-NEXT:    retq
 ;
 ; WIN64-LABEL: urem_i128_12:
 ; WIN64:       # %bb.0: # %entry
-; WIN64-NEXT:    movq %rdx, %r8
-; WIN64-NEXT:    shrdq $2, %rdx, %rcx
-; WIN64-NEXT:    shrq $2, %r8
-; WIN64-NEXT:    addq %rcx, %r8
-; WIN64-NEXT:    adcq $0, %r8
-; WIN64-NEXT:    movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB
-; WIN64-NEXT:    movq %r8, %rax
-; WIN64-NEXT:    mulq %rdx
-; WIN64-NEXT:    shrq %rdx
-; WIN64-NEXT:    leaq (%rdx,%rdx,2), %rax
-; WIN64-NEXT:    subq %rax, %r8
-; WIN64-NEXT:    andl $3, %ecx
-; WIN64-NEXT:    leaq (%rcx,%r8,4), %rax
-; WIN64-NEXT:    xorl %edx, %edx
+; WIN64-NEXT:    subq $72, %rsp
+; WIN64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    movq $12, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT:    callq __umodti3
+; WIN64-NEXT:    movq %xmm0, %rax
+; WIN64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT:    movq %xmm0, %rdx
+; WIN64-NEXT:    addq $72, %rsp
 ; WIN64-NEXT:    retq
 entry:
   %rem = urem i128 %x, 12
@@ -897,51 +887,27 @@ entry:
 define i128 @udiv_i128_12(i128 %x) nounwind {
 ; X86-64-LABEL: udiv_i128_12:
 ; X86-64:       # %bb.0: # %entry
-; X86-64-NEXT:    shrdq $2, %rsi, %rdi
-; X86-64-NEXT:    shrq $2, %rsi
-; X86-64-NEXT:    movq %rdi, %rcx
-; X86-64-NEXT:    addq %rsi, %rcx
-; X86-64-NEXT:    adcq $0, %rcx
-; X86-64-NEXT:    movabsq $-6148914691236517205, %r8 # imm = 0xAAAAAAAAAAAAAAAB
-; X86-64-NEXT:    movq %rcx, %rax
-; X86-64-NEXT:    mulq %r8
-; X86-64-NEXT:    shrq %rdx
-; X86-64-NEXT:    leaq (%rdx,%rdx,2), %rax
-; X86-64-NEXT:    subq %rax, %rcx
-; X86-64-NEXT:    subq %rcx, %rdi
-; X86-64-NEXT:    sbbq $0, %rsi
-; X86-64-NEXT:    movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA
-; X86-64-NEXT:    imulq %rdi, %rcx
-; X86-64-NEXT:    movq %rdi, %rax
-; X86-64-NEXT:    mulq %r8
-; X86-64-NEXT:    addq %rcx, %rdx
-; X86-64-NEXT:    imulq %rsi, %r8
-; X86-64-NEXT:    addq %r8, %rdx
+; X86-64-NEXT:    pushq %rax
+; X86-64-NEXT:    movl $12, %edx
+; X86-64-NEXT:    xorl %ecx, %ecx
+; X86-64-NEXT:    callq __udivti3@PLT
+; X86-64-NEXT:    popq %rcx
 ; X86-64-NEXT:    retq
 ;
 ; WIN64-LABEL: udiv_i128_12:
 ; WIN64:       # %bb.0: # %entry
-; WIN64-NEXT:    movq %rdx, %r8
-; WIN64-NEXT:    shrdq $2, %rdx, %rcx
-; WIN64-NEXT:    shrq $2, %r8
-; WIN64-NEXT:    movq %rcx, %r9
-; WIN64-NEXT:    addq %r8, %r9
-; WIN64-NEXT:    adcq $0, %r9
-; WIN64-NEXT:    movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB
-; WIN64-NEXT:    movq %r9, %rax
-; WIN64-NEXT:    mulq %r10
-; WIN64-NEXT:    shrq %rdx
-; WIN64-NEXT:    leaq (%rdx,%rdx,2), %rax
-; WIN64-NEXT:    subq %rax, %r9
-; WIN64-NEXT:    subq %r9, %rcx
-; WIN64-NEXT:    sbbq $0, %r8
-; WIN64-NEXT:    movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA
-; WIN64-NEXT:    imulq %rcx, %r9
-; WIN64-NEXT:    movq %rcx, %rax
-; WIN64-NEXT:    mulq %r10
-; WIN64-NEXT:    addq %r9, %rdx
-; WIN64-NEXT:    imulq %r10, %r8
-; WIN64-NEXT:    addq %r8, %rdx
+; WIN64-NEXT:    subq $72, %rsp
+; WIN64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    movq $12, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT:    callq __udivti3
+; WIN64-NEXT:    movq %xmm0, %rax
+; WIN64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; WIN64-NEXT:    movq %xmm0, %rdx
+; WIN64-NEXT:    addq $72, %rsp
 ; WIN64-NEXT:    retq
 entry:
   %rem = udiv i128 %x, 12


        

