[llvm] f6a7b47 - [TargetLowering][RISCV][X86] Support even divisors in expandDIVREMByConstant.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sat Oct 22 23:47:50 PDT 2022


Author: Craig Topper
Date: 2022-10-22T23:35:33-07:00
New Revision: f6a7b47820904c5e69cc4f133d382c74a87c44e8

URL: https://github.com/llvm/llvm-project/commit/f6a7b47820904c5e69cc4f133d382c74a87c44e8
DIFF: https://github.com/llvm/llvm-project/commit/f6a7b47820904c5e69cc4f133d382c74a87c44e8.diff

LOG: [TargetLowering][RISCV][X86] Support even divisors in expandDIVREMByConstant.

If the divisor is even, we can first shift the dividend and divisor
right by the number of trailing zeros in the divisor. The divisor is
then odd, so we can apply the original algorithm to compute the
remainder. Then we shift that remainder left by the number of trailing
zeros and add back the bits that were shifted out of the dividend. If
we want the quotient instead, we subtract the remainder from the
shifted dividend and multiply by the multiplicative inverse of the
now-odd divisor.
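
To make this concrete, here is a small standalone C++ sketch (not part
of the patch) of the same transform for a 64-bit dividend split into
32-bit halves and a divisor of 12. The helper name and the scalar
arithmetic are illustrative assumptions; only the constants mirror what
the expansion emits.

#include <cassert>
#include <cstdint>

// Illustrative only: compute X / 12 and X % 12 without a 64-bit divide.
// 12 = 3 << 2, so shift out the factor of 4, handle the odd divisor 3
// using the two 32-bit halves, then reconstruct quotient and remainder.
static void udivrem12(uint64_t X, uint64_t &Quot, uint64_t &Rem) {
  const unsigned TrailingZeros = 2;      // countTrailingZeros(12)
  uint64_t SavedBits = X & 3;            // bits shifted off the dividend
  uint64_t Shifted = X >> TrailingZeros; // now divide this by the odd part, 3

  // Remainder by summing digits: 2^32 % 3 == 1, so adding the two 32-bit
  // halves (and folding the carry back in) preserves the value modulo 3.
  uint64_t Sum = (Shifted & 0xffffffff) + (Shifted >> 32);
  Sum = (Sum & 0xffffffff) + (Sum >> 32);
  uint64_t RemOdd = Sum % 3; // a half-width UREM in the real expansion

  // Quotient: Shifted - RemOdd is an exact multiple of 3, so multiplying
  // by the multiplicative inverse of 3 modulo 2^64 recovers Shifted / 3.
  Quot = (Shifted - RemOdd) * 0xaaaaaaaaaaaaaaabULL;

  // Remainder of the original division: shift back and add the saved bits.
  Rem = (RemOdd << TrailingZeros) + SavedBits;
}

int main() {
  for (uint64_t X : {0ULL, 11ULL, 12ULL, 1000000000039ULL, ~0ULL}) {
    uint64_t Q, R;
    udivrem12(X, Q, R);
    assert(Q == X / 12 && R == X % 12);
  }
  return 0;
}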

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D135541

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
    llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
    llvm/test/CodeGen/X86/divide-by-constant.ll
    llvm/test/CodeGen/X86/divmod128.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 2aa8c04f4325..33631f290e28 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7168,8 +7168,17 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
 //   Remainder = Sum % Constant
 // This is based on "Remainder by Summing Digits" from Hacker's Delight.
 //
-// For division, we can compute the remainder, subtract it from the dividend,
-// and then multiply by the multiplicative inverse modulo (1 << (BitWidth / 2)).
+// For division, we can compute the remainder using the algorithm described
+// above and subtract it from the dividend to get an exact multiple of
+// Constant. Then we multiply that exact multiple by the multiplicative
+// inverse modulo (1 << BitWidth) to get the quotient.
+
+// If Constant is even, we can shift the dividend and the divisor right by the
+// number of trailing zeros in Constant before applying the remainder algorithm.
+// If we want the quotient, we subtract that remainder from the shifted
+// dividend and multiply by the multiplicative inverse of the shifted divisor.
+// If we want the remainder, we shift that remainder left by the number of
+// trailing zeros and add back the bits that were shifted out of the dividend.
 bool TargetLowering::expandDIVREMByConstant(SDNode *N,
                                             SmallVectorImpl<SDValue> &Result,
                                             EVT HiLoVT, SelectionDAG &DAG,
@@ -7188,7 +7197,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
   if (!CN)
     return false;
 
-  const APInt &Divisor = CN->getAPIntValue();
+  APInt Divisor = CN->getAPIntValue();
   unsigned BitWidth = Divisor.getBitWidth();
   unsigned HBitWidth = BitWidth / 2;
   assert(VT.getScalarSizeInBits() == BitWidth &&
@@ -7209,12 +7218,20 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
   if (DAG.shouldOptForSize())
     return false;
 
-  // Early out for 0, 1 or even divisors.
-  if (Divisor.ule(1) || Divisor[0] == 0)
+  // Early out for 0 or 1 divisors.
+  if (Divisor.ule(1))
     return false;
 
+  // If the divisor is even, shift it until it becomes odd.
+  unsigned TrailingZeros = 0;
+  if (!Divisor[0]) {
+    TrailingZeros = Divisor.countTrailingZeros();
+    Divisor.lshrInPlace(TrailingZeros);
+  }
+
   SDLoc dl(N);
   SDValue Sum;
+  SDValue PartialRem;
 
   // If (1 << HBitWidth) % divisor == 1, we can add the two halves together and
   // then add in the carry.
@@ -7229,6 +7246,27 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
                        DAG.getIntPtrConstant(1, dl));
     }
 
+    // Shift the input by the number of TrailingZeros in the divisor. The
+    // shifted out bits will be added to the remainder later.
+    if (TrailingZeros) {
+      LL = DAG.getNode(
+          ISD::OR, dl, HiLoVT,
+          DAG.getNode(ISD::SRL, dl, HiLoVT, LL,
+                      DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl)),
+          DAG.getNode(ISD::SHL, dl, HiLoVT, LH,
+                      DAG.getShiftAmountConstant(HBitWidth - TrailingZeros,
+                                                 HiLoVT, dl)));
+      LH = DAG.getNode(ISD::SRL, dl, HiLoVT, LH,
+                       DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
+
+      // Save the shifted off bits if we need the remainder.
+      if (Opcode != ISD::UDIV) {
+        APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
+        PartialRem = DAG.getNode(ISD::AND, dl, HiLoVT, LL,
+                                 DAG.getConstant(Mask, dl, HiLoVT));
+      }
+    }
+
     // Use addcarry if we can, otherwise use a compare to detect overflow.
     EVT SetCCType =
         getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT);
@@ -7260,45 +7298,45 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
   SDValue RemL =
       DAG.getNode(ISD::UREM, dl, HiLoVT, Sum,
                   DAG.getConstant(Divisor.trunc(HBitWidth), dl, HiLoVT));
-  // High half of the remainder is 0.
   SDValue RemH = DAG.getConstant(0, dl, HiLoVT);
 
-  // If we only want remainder, we're done.
-  if (Opcode == ISD::UREM) {
-    Result.push_back(RemL);
-    Result.push_back(RemH);
-    return true;
-  }
-
-  // Otherwise, we need to compute the quotient.
-
-  // Join the remainder halves.
-  SDValue Rem = DAG.getNode(ISD::BUILD_PAIR, dl, VT, RemL, RemH);
-
-  // Subtract the remainder from the input.
-  SDValue In = DAG.getNode(ISD::SUB, dl, VT, N->getOperand(0), Rem);
-
-  // Multiply by the multiplicative inverse of the divisor modulo
-  // (1 << BitWidth).
-  APInt Mod = APInt::getSignedMinValue(BitWidth + 1);
-  APInt MulFactor = Divisor.zext(BitWidth + 1);
-  MulFactor = MulFactor.multiplicativeInverse(Mod);
-  MulFactor = MulFactor.trunc(BitWidth);
-
-  SDValue Quotient =
-      DAG.getNode(ISD::MUL, dl, VT, In, DAG.getConstant(MulFactor, dl, VT));
-
-  // Split the quotient into low and high parts.
-  SDValue QuotL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
-                              DAG.getIntPtrConstant(0, dl));
-  SDValue QuotH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
-                              DAG.getIntPtrConstant(1, dl));
-  Result.push_back(QuotL);
-  Result.push_back(QuotH);
-  // For DIVREM, also return the remainder parts.
-  if (Opcode == ISD::UDIVREM) {
+  if (Opcode != ISD::UREM) {
+    // Subtract the remainder from the shifted dividend.
+    SDValue Dividend = DAG.getNode(ISD::BUILD_PAIR, dl, VT, LL, LH);
+    SDValue Rem = DAG.getNode(ISD::BUILD_PAIR, dl, VT, RemL, RemH);
+
+    Dividend = DAG.getNode(ISD::SUB, dl, VT, Dividend, Rem);
+
+    // Multiply by the multiplicative inverse of the divisor modulo
+    // (1 << BitWidth).
+    APInt Mod = APInt::getSignedMinValue(BitWidth + 1);
+    APInt MulFactor = Divisor.zext(BitWidth + 1);
+    MulFactor = MulFactor.multiplicativeInverse(Mod);
+    MulFactor = MulFactor.trunc(BitWidth);
+
+    SDValue Quotient = DAG.getNode(ISD::MUL, dl, VT, Dividend,
+                                   DAG.getConstant(MulFactor, dl, VT));
+
+    // Split the quotient into low and high parts.
+    SDValue QuotL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
+                                DAG.getIntPtrConstant(0, dl));
+    SDValue QuotH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
+                                DAG.getIntPtrConstant(1, dl));
+    Result.push_back(QuotL);
+    Result.push_back(QuotH);
+  }
+
+  if (Opcode != ISD::UDIV) {
+    // If we shifted the input, shift the remainder left and add the bits we
+    // shifted off the input.
+    if (TrailingZeros) {
+      APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
+      RemL = DAG.getNode(ISD::SHL, dl, HiLoVT, RemL,
+                         DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
+      RemL = DAG.getNode(ISD::ADD, dl, HiLoVT, RemL, PartialRem);
+    }
     Result.push_back(RemL);
-    Result.push_back(RemH);
+    Result.push_back(DAG.getConstant(0, dl, HiLoVT));
   }
 
   return true;

diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
index dbfea8a03212..cd2b7aa9f351 100644
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -502,24 +502,59 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind {
 define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_udiv_12:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    li a2, 12
-; RV32-NEXT:    li a3, 0
-; RV32-NEXT:    call __udivdi3@plt
-; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    slli a2, a1, 30
+; RV32-NEXT:    srli a0, a0, 2
+; RV32-NEXT:    or a0, a0, a2
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    add a2, a0, a1
+; RV32-NEXT:    sltu a3, a2, a0
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    lui a3, 699051
+; RV32-NEXT:    addi a4, a3, -1365
+; RV32-NEXT:    mulhu a5, a2, a4
+; RV32-NEXT:    srli a6, a5, 1
+; RV32-NEXT:    andi a5, a5, -2
+; RV32-NEXT:    add a5, a5, a6
+; RV32-NEXT:    sub a2, a2, a5
+; RV32-NEXT:    sub a5, a0, a2
+; RV32-NEXT:    addi a3, a3, -1366
+; RV32-NEXT:    mul a3, a5, a3
+; RV32-NEXT:    mulhu a6, a5, a4
+; RV32-NEXT:    add a3, a6, a3
+; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    sub a0, a1, a0
+; RV32-NEXT:    mul a0, a0, a4
+; RV32-NEXT:    add a1, a3, a0
+; RV32-NEXT:    mul a0, a5, a4
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_udiv_12:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    li a2, 12
-; RV64-NEXT:    li a3, 0
-; RV64-NEXT:    call __udivti3@plt
-; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    slli a2, a1, 62
+; RV64-NEXT:    srli a0, a0, 2
+; RV64-NEXT:    or a0, a0, a2
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    lui a2, %hi(.LCPI10_0)
+; RV64-NEXT:    ld a2, %lo(.LCPI10_0)(a2)
+; RV64-NEXT:    add a3, a0, a1
+; RV64-NEXT:    sltu a4, a3, a0
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    mulhu a4, a3, a2
+; RV64-NEXT:    srli a5, a4, 1
+; RV64-NEXT:    andi a4, a4, -2
+; RV64-NEXT:    lui a6, %hi(.LCPI10_1)
+; RV64-NEXT:    ld a6, %lo(.LCPI10_1)(a6)
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    sub a3, a3, a4
+; RV64-NEXT:    sub a4, a0, a3
+; RV64-NEXT:    mul a5, a4, a6
+; RV64-NEXT:    mulhu a6, a4, a2
+; RV64-NEXT:    add a5, a6, a5
+; RV64-NEXT:    sltu a0, a0, a3
+; RV64-NEXT:    sub a0, a1, a0
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a1, a5, a0
+; RV64-NEXT:    mul a0, a4, a2
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 12
   ret iXLen2 %a

diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
index c0e210a7ec46..aa235c48c56a 100644
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -335,24 +335,46 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind {
 define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_urem_12:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    li a2, 12
-; RV32-NEXT:    li a3, 0
-; RV32-NEXT:    call __umoddi3@plt
-; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    slli a2, a1, 30
+; RV32-NEXT:    srli a0, a0, 2
+; RV32-NEXT:    or a0, a0, a2
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    sltu a2, a1, a0
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    lui a2, 699051
+; RV32-NEXT:    addi a2, a2, -1365
+; RV32-NEXT:    mulhu a2, a1, a2
+; RV32-NEXT:    srli a3, a2, 1
+; RV32-NEXT:    andi a2, a2, -2
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    sub a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    andi a0, a0, 3
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    li a1, 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_urem_12:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    li a2, 12
-; RV64-NEXT:    li a3, 0
-; RV64-NEXT:    call __umodti3@plt
-; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    slli a2, a1, 62
+; RV64-NEXT:    srli a0, a0, 2
+; RV64-NEXT:    or a0, a0, a2
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    lui a2, %hi(.LCPI10_0)
+; RV64-NEXT:    ld a2, %lo(.LCPI10_0)(a2)
+; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    sltu a3, a1, a0
+; RV64-NEXT:    add a1, a1, a3
+; RV64-NEXT:    mulhu a2, a1, a2
+; RV64-NEXT:    srli a3, a2, 1
+; RV64-NEXT:    andi a2, a2, -2
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    sub a1, a1, a2
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    andi a0, a0, 3
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    li a1, 0
 ; RV64-NEXT:    ret
   %a = urem iXLen2 %x, 12
   ret iXLen2 %a

diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index 8f6d3ddb60e4..9197dc0c1d64 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -735,13 +735,23 @@ entry:
 define i64 @urem_i64_12(i64 %x) nounwind {
 ; X32-LABEL: urem_i64_12:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    subl $12, %esp
-; X32-NEXT:    pushl $0
-; X32-NEXT:    pushl $12
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    calll __umoddi3
-; X32-NEXT:    addl $28, %esp
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    shrdl $2, %ecx, %esi
+; X32-NEXT:    shrl $2, %ecx
+; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    movl $-1431655765, %edx # imm = 0xAAAAAAAB
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %edx
+; X32-NEXT:    shrl %edx
+; X32-NEXT:    leal (%edx,%edx,2), %eax
+; X32-NEXT:    subl %eax, %ecx
+; X32-NEXT:    andl $3, %esi
+; X32-NEXT:    leal (%esi,%ecx,4), %eax
+; X32-NEXT:    xorl %edx, %edx
+; X32-NEXT:    popl %esi
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: urem_i64_12:
@@ -1116,13 +1126,33 @@ entry:
 define i64 @udiv_i64_12(i64 %x) nounwind {
 ; X32-LABEL: udiv_i64_12:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    subl $12, %esp
-; X32-NEXT:    pushl $0
-; X32-NEXT:    pushl $12
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    calll __udivdi3
-; X32-NEXT:    addl $28, %esp
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    shrdl $2, %edi, %ecx
+; X32-NEXT:    shrl $2, %edi
+; X32-NEXT:    movl %ecx, %esi
+; X32-NEXT:    addl %edi, %esi
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl $-1431655765, %ebx # imm = 0xAAAAAAAB
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    shrl %edx
+; X32-NEXT:    leal (%edx,%edx,2), %eax
+; X32-NEXT:    subl %eax, %esi
+; X32-NEXT:    subl %esi, %ecx
+; X32-NEXT:    sbbl $0, %edi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: udiv_i64_12:

diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index 4549598ca3ed..b0657f4f8456 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -425,27 +425,37 @@ entry:
 define i128 @urem_i128_12(i128 %x) nounwind {
 ; X86-64-LABEL: urem_i128_12:
 ; X86-64:       # %bb.0: # %entry
-; X86-64-NEXT:    pushq %rax
-; X86-64-NEXT:    movl $12, %edx
-; X86-64-NEXT:    xorl %ecx, %ecx
-; X86-64-NEXT:    callq __umodti3@PLT
-; X86-64-NEXT:    popq %rcx
+; X86-64-NEXT:    shrdq $2, %rsi, %rdi
+; X86-64-NEXT:    shrq $2, %rsi
+; X86-64-NEXT:    addq %rdi, %rsi
+; X86-64-NEXT:    adcq $0, %rsi
+; X86-64-NEXT:    movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT:    movq %rsi, %rax
+; X86-64-NEXT:    mulq %rcx
+; X86-64-NEXT:    shrq %rdx
+; X86-64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; X86-64-NEXT:    subq %rax, %rsi
+; X86-64-NEXT:    andl $3, %edi
+; X86-64-NEXT:    leaq (%rdi,%rsi,4), %rax
+; X86-64-NEXT:    xorl %edx, %edx
 ; X86-64-NEXT:    retq
 ;
 ; WIN64-LABEL: urem_i128_12:
 ; WIN64:       # %bb.0: # %entry
-; WIN64-NEXT:    subq $72, %rsp
-; WIN64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq $12, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT:    callq __umodti3
-; WIN64-NEXT:    movq %xmm0, %rax
-; WIN64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT:    movq %xmm0, %rdx
-; WIN64-NEXT:    addq $72, %rsp
+; WIN64-NEXT:    movq %rdx, %r8
+; WIN64-NEXT:    shrdq $2, %rdx, %rcx
+; WIN64-NEXT:    shrq $2, %r8
+; WIN64-NEXT:    addq %rcx, %r8
+; WIN64-NEXT:    adcq $0, %r8
+; WIN64-NEXT:    movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT:    movq %r8, %rax
+; WIN64-NEXT:    mulq %rdx
+; WIN64-NEXT:    shrq %rdx
+; WIN64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; WIN64-NEXT:    subq %rax, %r8
+; WIN64-NEXT:    andl $3, %ecx
+; WIN64-NEXT:    leaq (%rcx,%r8,4), %rax
+; WIN64-NEXT:    xorl %edx, %edx
 ; WIN64-NEXT:    retq
 entry:
   %rem = urem i128 %x, 12
@@ -887,27 +897,51 @@ entry:
 define i128 @udiv_i128_12(i128 %x) nounwind {
 ; X86-64-LABEL: udiv_i128_12:
 ; X86-64:       # %bb.0: # %entry
-; X86-64-NEXT:    pushq %rax
-; X86-64-NEXT:    movl $12, %edx
-; X86-64-NEXT:    xorl %ecx, %ecx
-; X86-64-NEXT:    callq __udivti3@PLT
-; X86-64-NEXT:    popq %rcx
+; X86-64-NEXT:    shrdq $2, %rsi, %rdi
+; X86-64-NEXT:    shrq $2, %rsi
+; X86-64-NEXT:    movq %rdi, %rcx
+; X86-64-NEXT:    addq %rsi, %rcx
+; X86-64-NEXT:    adcq $0, %rcx
+; X86-64-NEXT:    movabsq $-6148914691236517205, %r8 # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT:    movq %rcx, %rax
+; X86-64-NEXT:    mulq %r8
+; X86-64-NEXT:    shrq %rdx
+; X86-64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; X86-64-NEXT:    subq %rax, %rcx
+; X86-64-NEXT:    subq %rcx, %rdi
+; X86-64-NEXT:    sbbq $0, %rsi
+; X86-64-NEXT:    movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA
+; X86-64-NEXT:    imulq %rdi, %rcx
+; X86-64-NEXT:    movq %rdi, %rax
+; X86-64-NEXT:    mulq %r8
+; X86-64-NEXT:    addq %rcx, %rdx
+; X86-64-NEXT:    imulq %rsi, %r8
+; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
 ; WIN64-LABEL: udiv_i128_12:
 ; WIN64:       # %bb.0: # %entry
-; WIN64-NEXT:    subq $72, %rsp
-; WIN64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq $12, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT:    callq __udivti3
-; WIN64-NEXT:    movq %xmm0, %rax
-; WIN64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT:    movq %xmm0, %rdx
-; WIN64-NEXT:    addq $72, %rsp
+; WIN64-NEXT:    movq %rdx, %r8
+; WIN64-NEXT:    shrdq $2, %rdx, %rcx
+; WIN64-NEXT:    shrq $2, %r8
+; WIN64-NEXT:    movq %rcx, %r9
+; WIN64-NEXT:    addq %r8, %r9
+; WIN64-NEXT:    adcq $0, %r9
+; WIN64-NEXT:    movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT:    movq %r9, %rax
+; WIN64-NEXT:    mulq %r10
+; WIN64-NEXT:    shrq %rdx
+; WIN64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; WIN64-NEXT:    subq %rax, %r9
+; WIN64-NEXT:    subq %r9, %rcx
+; WIN64-NEXT:    sbbq $0, %r8
+; WIN64-NEXT:    movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA
+; WIN64-NEXT:    imulq %rcx, %r9
+; WIN64-NEXT:    movq %rcx, %rax
+; WIN64-NEXT:    mulq %r10
+; WIN64-NEXT:    addq %r9, %rdx
+; WIN64-NEXT:    imulq %r10, %r8
+; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
   %rem = udiv i128 %x, 12


        

