[llvm] [SelectionDAG][X86] Fold `sub(x, mul(divrem(x,y)[0], y))` to `divrem(x, y)[1]` (PR #136565)

Mon Apr 21 22:05:59 PDT 2025

https://github.com/el-ev updated https://github.com/llvm/llvm-project/pull/136565

>From c2ef371a904cdd5bbdc504990d107988f0f4aaa4 Mon Sep 17 00:00:00 2001
From: Iris Shi <0.0 at owo.li>
Date: Tue, 22 Apr 2025 13:01:23 +0800
Subject: [PATCH 1/2] 1

---
 .../X86/div-rem-pair-recomposition-signed.ll  | 44 +++++++++++++++++--
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 880062bbc4f9e..98ae173cb3d5a 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -1194,6 +1194,42 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw
   ret <2 x i64> %t2
 }
 
+define i32 @scalar_i32_const_pow2_divisor(i32 %0, ptr %1) minsize nounwind {
+; X86-LABEL: scalar_i32_const_pow2_divisor:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $256, %edi # imm = 0x100
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    cltd
+; X86-NEXT:    idivl %edi
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    shll $8, %edx
+; X86-NEXT:    subl %edx, %ecx
+; X86-NEXT:    movl %ecx, (%esi)
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    retl
+;
+; X64-LABEL: scalar_i32_const_pow2_divisor:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $256, %ecx # imm = 0x100
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    cltd
+; X64-NEXT:    idivl %ecx
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    shll $8, %ecx
+; X64-NEXT:    subl %ecx, %edi
+; X64-NEXT:    movl %edi, (%rsi)
+; X64-NEXT:    retq
+  %3 = srem i32 %0, 256
+  store i32 %3, ptr %1, align 4
+  %4 = sdiv i32 %0, 256
+  ret i32 %4
+}
+
 ; Special tests.
 
 define i32 @scalar_i32_commutative(i32 %x, ptr %ysrc, ptr %divdst) nounwind {
@@ -1296,14 +1332,14 @@ define i32 @multiple_bb(i32 %x, i32 %y, ptr %divdst, i1 zeroext %store_srem, ptr
 ; X86-NEXT:    idivl %esi
 ; X86-NEXT:    movl %eax, (%edi)
 ; X86-NEXT:    testb %bl, %bl
-; X86-NEXT:    je .LBB11_2
+; X86-NEXT:    je .LBB12_2
 ; X86-NEXT:  # %bb.1: # %do_srem
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    imull %esi, %edi
 ; X86-NEXT:    subl %edi, %ecx
 ; X86-NEXT:    movl %ecx, (%edx)
-; X86-NEXT:  .LBB11_2: # %end
+; X86-NEXT:  .LBB12_2: # %end
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1317,13 +1353,13 @@ define i32 @multiple_bb(i32 %x, i32 %y, ptr %divdst, i1 zeroext %store_srem, ptr
 ; X64-NEXT:    idivl %esi
 ; X64-NEXT:    movl %eax, (%r9)
 ; X64-NEXT:    testl %ecx, %ecx
-; X64-NEXT:    je .LBB11_2
+; X64-NEXT:    je .LBB12_2
 ; X64-NEXT:  # %bb.1: # %do_srem
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    imull %esi, %ecx
 ; X64-NEXT:    subl %ecx, %edi
 ; X64-NEXT:    movl %edi, (%r8)
-; X64-NEXT:  .LBB11_2: # %end
+; X64-NEXT:  .LBB12_2: # %end
 ; X64-NEXT:    retq
   %div = sdiv i32 %x, %y
   store i32 %div, ptr %divdst, align 4

>From a09b4acc099d18b1322ceaf96c68c2cc41ad016f Mon Sep 17 00:00:00 2001
From: Iris Shi <0.0 at owo.li>
Date: Mon, 21 Apr 2025 21:18:08 +0800
Subject: [PATCH 2/2] [SelectionDAG][X86] Fold `sub(x, mul(divrem(x,y)[0], y))`
 to `divrem(x, y)[1]`

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  59 ++++++++
 .../X86/div-rem-pair-recomposition-signed.ll  | 135 ++++++------------
 .../div-rem-pair-recomposition-unsigned.ll    | 110 +++++---------
 3 files changed, 132 insertions(+), 172 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b175e35385ec6..f5f96e6fcf27d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3867,6 +3867,62 @@ static SDValue foldSubCtlzNot(SDNode *N, SelectionDAG &DAG) {
   return Matcher.getNode(ISD::CTLZ_ZERO_UNDEF, DL, VT, Not);
 }
 
+// Fold sub(x, mul(divrem(x,y)[0], y)) to divrem(x, y)[1]
+static SDValue foldRemainderIdiom(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == ISD::SUB && "Node must be a SUB");
+  SDValue Sub0 = N->getOperand(0);
+  SDValue Sub1 = N->getOperand(1);
+  SDLoc DL(N);
+
+  auto CheckAndFoldMulCase = [&](SDValue DivRem, SDValue MaybeY) -> SDValue {
+    if ((DivRem.getOpcode() == ISD::SDIVREM ||
+         DivRem.getOpcode() == ISD::UDIVREM) &&
+        DivRem.getResNo() == 0 && DivRem.getOperand(0) == Sub0 &&
+        DivRem.getOperand(1) == MaybeY) {
+      return SDValue(DivRem.getNode(), 1);
+    }
+    return SDValue();
+  };
+
+  if (Sub1.getOpcode() == ISD::MUL) {
+    // (sub x, (mul divrem(x,y)[0], y))
+    SDValue Mul0 = Sub1.getOperand(0);
+    SDValue Mul1 = Sub1.getOperand(1);
+
+    SDValue Res = CheckAndFoldMulCase(Mul0, Mul1);
+    if (Res)
+      return Res;
+
+    Res = CheckAndFoldMulCase(Mul1, Mul0);
+    if (Res)
+      return Res;
+
+  } else if (Sub1.getOpcode() == ISD::SHL) {
+    // Handle (sub x, (shl divrem(x,y)[0], C)) where y = 1 << C
+    SDValue Shl0 = Sub1.getOperand(0);
+    SDValue Shl1 = Sub1.getOperand(1);
+    // Check if Shl0 is divrem(x, Y)[0]
+    if ((Shl0.getOpcode() == ISD::SDIVREM ||
+         Shl0.getOpcode() == ISD::UDIVREM) &&
+        Shl0.getResNo() == 0 && Shl0.getOperand(0) == Sub0) {
+
+      SDValue Divisor = Shl0.getOperand(1);
+
+      ConstantSDNode *DivC = isConstOrConstSplat(Divisor);
+      ConstantSDNode *ShC = isConstOrConstSplat(Shl1);
+      if (!DivC || !ShC) {
+        return SDValue();
+      }
+
+      if (DivC->getAPIntValue().isPowerOf2() &&
+          DivC->getAPIntValue().logBase2() == ShC->getAPIntValue()) {
+        return SDValue(Shl0.getNode(), 1);
+      }
+    }
+  }
+  return SDValue();
+}
+
 // Since it may not be valid to emit a fold to zero for vector initializers
 // check if we can before folding.
 static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
@@ -4094,6 +4150,9 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   if (SDValue V = foldSubToUSubSat(VT, N, DL))
     return V;
 
+  if (SDValue V = foldRemainderIdiom(N, DAG))
+    return V;
+
   // (A - B) - 1  ->  add (xor B, -1), A
   if (sd_match(N, m_Sub(m_OneUse(m_Sub(m_Value(A), m_Value(B))), m_One())))
     return DAG.getNode(ISD::ADD, DL, VT, A, DAG.getNOT(DL, B, VT));
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 98ae173cb3d5a..8f82a5bc6554e 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -11,25 +11,20 @@
 define i8 @scalar_i8(i8 %x, i8 %y, ptr %divdst) nounwind {
 ; X86-LABEL: scalar_i8:
 ; X86:       # %bb.0:
+; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    idivb {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movsbl %cl, %eax
-; X86-NEXT:    idivb %ch
+; X86-NEXT:    movsbl %ah, %ecx
 ; X86-NEXT:    movb %al, (%edx)
-; X86-NEXT:    mulb %ch
-; X86-NEXT:    subb %al, %cl
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: scalar_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movsbl %dil, %ecx
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movsbl %dil, %eax
 ; X64-NEXT:    idivb %sil
+; X64-NEXT:    movsbl %ah, %ecx
 ; X64-NEXT:    movb %al, (%rdx)
-; X64-NEXT:    mulb %sil
-; X64-NEXT:    subb %al, %cl
 ; X64-NEXT:    movl %ecx, %eax
 ; X64-NEXT:    retq
   %div = sdiv i8 %x, %y
@@ -42,34 +37,23 @@ define i8 @scalar_i8(i8 %x, i8 %y, ptr %divdst) nounwind {
 define i16 @scalar_i16(i16 %x, i16 %y, ptr %divdst) nounwind {
 ; X86-LABEL: scalar_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cwtd
-; X86-NEXT:    idivw %si
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    movw %ax, (%edi)
-; X86-NEXT:    imull %eax, %esi
-; X86-NEXT:    subl %esi, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    idivw {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: scalar_i16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    cwtd
 ; X64-NEXT:    idivw %si
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
 ; X64-NEXT:    movw %ax, (%rcx)
-; X64-NEXT:    imull %eax, %esi
-; X64-NEXT:    subl %esi, %edi
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edx, %eax
 ; X64-NEXT:    retq
   %div = sdiv i16 %x, %y
   store i16 %div, ptr %divdst, align 4
@@ -81,20 +65,12 @@ define i16 @scalar_i16(i16 %x, i16 %y, ptr %divdst) nounwind {
 define i32 @scalar_i32(i32 %x, i32 %y, ptr %divdst) nounwind {
 ; X86-LABEL: scalar_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %edi
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    imull %edi, %eax
-; X86-NEXT:    subl %eax, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    idivl {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: scalar_i32:
@@ -104,9 +80,7 @@ define i32 @scalar_i32(i32 %x, i32 %y, ptr %divdst) nounwind {
 ; X64-NEXT:    cltd
 ; X64-NEXT:    idivl %esi
 ; X64-NEXT:    movl %eax, (%rcx)
-; X64-NEXT:    imull %esi, %eax
-; X64-NEXT:    subl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edx, %eax
 ; X64-NEXT:    retq
   %div = sdiv i32 %x, %y
   store i32 %div, ptr %divdst, align 4
@@ -158,9 +132,7 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
 ; X64-NEXT:    cqto
 ; X64-NEXT:    idivq %rsi
 ; X64-NEXT:    movq %rax, (%rcx)
-; X64-NEXT:    imulq %rsi, %rax
-; X64-NEXT:    subq %rax, %rdi
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdx, %rax
 ; X64-NEXT:    retq
   %div = sdiv i64 %x, %y
   store i64 %div, ptr %divdst, align 4
@@ -1197,32 +1169,21 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw
 define i32 @scalar_i32_const_pow2_divisor(i32 %0, ptr %1) minsize nounwind {
 ; X86-LABEL: scalar_i32_const_pow2_divisor:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $256, %edi # imm = 0x100
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $256, %ecx # imm = 0x100
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %edi
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shll $8, %edx
-; X86-NEXT:    subl %edx, %ecx
-; X86-NEXT:    movl %ecx, (%esi)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    idivl %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: scalar_i32_const_pow2_divisor:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $256, %ecx # imm = 0x100
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl $256, %ecx # imm = 0x100
 ; X64-NEXT:    cltd
 ; X64-NEXT:    idivl %ecx
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    shll $8, %ecx
-; X64-NEXT:    subl %ecx, %edi
-; X64-NEXT:    movl %edi, (%rsi)
+; X64-NEXT:    movl %edx, (%rsi)
 ; X64-NEXT:    retq
   %3 = srem i32 %0, 256
   store i32 %3, ptr %1, align 4
@@ -1235,34 +1196,23 @@ define i32 @scalar_i32_const_pow2_divisor(i32 %0, ptr %1) minsize nounwind {
 define i32 @scalar_i32_commutative(i32 %x, ptr %ysrc, ptr %divdst) nounwind {
 ; X86-LABEL: scalar_i32_commutative:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edi
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %edi
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    imull %eax, %edi
-; X86-NEXT:    subl %edi, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    idivl (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: scalar_i32_commutative:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movl (%rsi), %esi
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    cltd
-; X64-NEXT:    idivl %esi
+; X64-NEXT:    idivl (%rsi)
 ; X64-NEXT:    movl %eax, (%rcx)
-; X64-NEXT:    imull %eax, %esi
-; X64-NEXT:    subl %esi, %edi
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edx, %eax
 ; X64-NEXT:    retq
   %y = load i32, ptr %ysrc, align 4
   %div = sdiv i32 %x, %y
@@ -1276,24 +1226,20 @@ define i32 @scalar_i32_commutative(i32 %x, ptr %ysrc, ptr %divdst) nounwind {
 define i32 @extrause(i32 %x, i32 %y, ptr %divdst, ptr %t1dst) nounwind {
 ; X86-LABEL: extrause:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %ebx
+; X86-NEXT:    idivl %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %eax, (%edi)
-; X86-NEXT:    imull %ebx, %eax
+; X86-NEXT:    imull %ecx, %eax
 ; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    subl %eax, %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extrause:
@@ -1305,8 +1251,7 @@ define i32 @extrause(i32 %x, i32 %y, ptr %divdst, ptr %t1dst) nounwind {
 ; X64-NEXT:    movl %eax, (%r8)
 ; X64-NEXT:    imull %esi, %eax
 ; X64-NEXT:    movl %eax, (%rcx)
-; X64-NEXT:    subl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edx, %eax
 ; X64-NEXT:    retq
   %div = sdiv i32 %x, %y
   store i32 %div, ptr %divdst, align 4
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 6fdde0b14a984..0bef9ee50bd54 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -11,25 +11,20 @@
 define i8 @scalar_i8(i8 %x, i8 %y, ptr %divdst) nounwind {
 ; X86-LABEL: scalar_i8:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    divb {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    divb %ch
+; X86-NEXT:    movzbl %ah, %ecx
 ; X86-NEXT:    movb %al, (%edx)
-; X86-NEXT:    mulb %ch
-; X86-NEXT:    subb %al, %cl
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: scalar_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movzbl %dil, %ecx
-; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movzbl %dil, %eax
 ; X64-NEXT:    divb %sil
+; X64-NEXT:    movzbl %ah, %ecx
 ; X64-NEXT:    movb %al, (%rdx)
-; X64-NEXT:    mulb %sil
-; X64-NEXT:    subb %al, %cl
 ; X64-NEXT:    movl %ecx, %eax
 ; X64-NEXT:    retq
   %div = udiv i8 %x, %y
@@ -42,34 +37,23 @@ define i8 @scalar_i8(i8 %x, i8 %y, ptr %divdst) nounwind {
 define i16 @scalar_i16(i16 %x, i16 %y, ptr %divdst) nounwind {
 ; X86-LABEL: scalar_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divw %si
-; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    movw %ax, (%edi)
-; X86-NEXT:    imull %eax, %esi
-; X86-NEXT:    subl %esi, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    divw {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: scalar_i16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    divw %si
-; X64-NEXT:    # kill: def $ax killed $ax def $eax
 ; X64-NEXT:    movw %ax, (%rcx)
-; X64-NEXT:    imull %eax, %esi
-; X64-NEXT:    subl %esi, %edi
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edx, %eax
 ; X64-NEXT:    retq
   %div = udiv i16 %x, %y
   store i16 %div, ptr %divdst, align 4
@@ -81,20 +65,12 @@ define i16 @scalar_i16(i16 %x, i16 %y, ptr %divdst) nounwind {
 define i32 @scalar_i32(i32 %x, i32 %y, ptr %divdst) nounwind {
 ; X86-LABEL: scalar_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divl %edi
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    imull %edi, %eax
-; X86-NEXT:    subl %eax, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    divl {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: scalar_i32:
@@ -104,9 +80,7 @@ define i32 @scalar_i32(i32 %x, i32 %y, ptr %divdst) nounwind {
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    divl %esi
 ; X64-NEXT:    movl %eax, (%rcx)
-; X64-NEXT:    imull %esi, %eax
-; X64-NEXT:    subl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edx, %eax
 ; X64-NEXT:    retq
   %div = udiv i32 %x, %y
   store i32 %div, ptr %divdst, align 4
@@ -158,9 +132,7 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    divq %rsi
 ; X64-NEXT:    movq %rax, (%rcx)
-; X64-NEXT:    imulq %rsi, %rax
-; X64-NEXT:    subq %rax, %rdi
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdx, %rax
 ; X64-NEXT:    retq
   %div = udiv i64 %x, %y
   store i64 %div, ptr %divdst, align 4
@@ -1153,34 +1125,23 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw
 define i32 @scalar_i32_commutative(i32 %x, ptr %ysrc, ptr %divdst) nounwind {
 ; X86-LABEL: scalar_i32_commutative:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edi
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divl %edi
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    imull %eax, %edi
-; X86-NEXT:    subl %edi, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    divl (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: scalar_i32_commutative:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movl (%rsi), %esi
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    divl %esi
+; X64-NEXT:    divl (%rsi)
 ; X64-NEXT:    movl %eax, (%rcx)
-; X64-NEXT:    imull %eax, %esi
-; X64-NEXT:    subl %esi, %edi
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edx, %eax
 ; X64-NEXT:    retq
   %y = load i32, ptr %ysrc, align 4
   %div = udiv i32 %x, %y
@@ -1194,24 +1155,20 @@ define i32 @scalar_i32_commutative(i32 %x, ptr %ysrc, ptr %divdst) nounwind {
 define i32 @extrause(i32 %x, i32 %y, ptr %divdst, ptr %t1dst) nounwind {
 ; X86-LABEL: extrause:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divl %ebx
+; X86-NEXT:    divl %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %eax, (%edi)
-; X86-NEXT:    imull %ebx, %eax
+; X86-NEXT:    imull %ecx, %eax
 ; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    subl %eax, %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extrause:
@@ -1223,8 +1180,7 @@ define i32 @extrause(i32 %x, i32 %y, ptr %divdst, ptr %t1dst) nounwind {
 ; X64-NEXT:    movl %eax, (%r8)
 ; X64-NEXT:    imull %esi, %eax
 ; X64-NEXT:    movl %eax, (%rcx)
-; X64-NEXT:    subl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edx, %eax
 ; X64-NEXT:    retq
   %div = udiv i32 %x, %y
   store i32 %div, ptr %divdst, align 4