[llvm] [SelectionDAG] Optimize 32-bit udiv with 33-bit magic constants on 64-bit targets (PR #181288)

MITSUNARI Shigeo via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 12 21:16:01 PST 2026


https://github.com/herumi updated https://github.com/llvm/llvm-project/pull/181288

>From ae3673be45d1c9f4e1864b897238ae099f73eaac Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi at nifty.com>
Date: Tue, 10 Feb 2026 16:01:52 +0900
Subject: [PATCH 1/2] [SelectionDAG] Optimize 32-bit udiv with 33-bit magic
 constants on 64-bit targets

Compiler optimization of uint32_t division by a constant (such as x / 7) is based on
the Granlund-Montgomery method (1994). However, the sequence emitted for the
IsAdd=true case (33-bit magic constants) is tuned for 32-bit CPUs, not 64-bit CPUs.

This patch optimizes that sequence for 64-bit CPUs (such as x86_64 and Apple M-series)
by pre-shifting the 33-bit magic constant left by (64 - a) bits, where a = 33 + PostShift
is the total shift of the original sequence, and using the high 64 bits of a 64x64->128
bit multiplication directly. This eliminates the add/sub/shift sequence required by the
traditional approach.

Before (7 instructions for x / 7 on x86_64):
  movl    %edi, %eax
  imulq   $613566757, %rax, %rax
  shrq    $32, %rax
  subl    %eax, %edi
  shrl    %edi
  addl    %edi, %eax
  shrl    $2, %eax

After with BMI2 (3 instructions):
  movl    %edi, %edx
  movabsq $2635249153617166336, %rax
  mulxq   %rax, %rax, %rax

Without BMI2 (4 instructions):
  movl    %edi, %eax
  movabsq $2635249153617166336, %rcx
  mulq    %rcx
  movq    %rdx, %rax
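
As a minimal, standalone sketch of the arithmetic (not the patch code; it assumes a
compiler with the unsigned __int128 extension, e.g. GCC/Clang, and the helper name is
made up): for d = 7 the 33-bit magic is 2^32 + 0x24924925 with PostShift = 2, so
a = 35 and the constant is pre-shifted left by 64 - 35 = 29 bits, giving the
0x24924924A0000000 shown above.

  #include <cstdint>

  // Sketch only: x / 7 for uint32_t x via the pre-shifted 33-bit magic.
  // (2^32 + 0x24924925) << 29 == 0x24924924A0000000; the quotient is just the
  // high 64 bits of a single 64x64->128 bit multiply, matching the asm above.
  uint32_t udiv7(uint32_t x) {
    const uint64_t ShiftedMagic = UINT64_C(0x124924925) << 29;
    return (uint32_t)(((unsigned __int128)x * ShiftedMagic) >> 64);
  }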

Benchmark results (1 billion divisions by 7, 19, and 107):

| CPU        | Original (sec) | Optimized (sec) | Time reduction |
|------------|----------------|-----------------|----------------|
| Intel Xeon | 6.40           | 3.83            | 40.2%          |
| Apple M4   | 6.70           | 3.38            | 49.6%          |

The optimization applies when:
- Element size is 32 bits
- Type is scalar (not vector) - SIMD lacks 64x64->128 high multiply instructions
- A 64-bit high multiply (MULHU or UMUL_LOHI on i64) is legal or custom on the target
- UnsignedDivisionByConstantInfo::IsAdd is true (33-bit magic)

Affected divisors: 7, 19, 21, 27, 31, 35, 37, etc. (about 23% of 31-bit divisors)
Architectures: x86_64, AArch64, RISC-V64 (any 64-bit target with 64x64->128 bit multiply)
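
As a rough, self-contained check of which divisors fall into this case (this is not
LLVM's UnsignedDivisionByConstantInfo and may not match its classification in every
case; it uses the classic round-up criterion m = ceil(2^(32+s)/d) with
m*d - 2^(32+s) <= 2^s, needs unsigned __int128, and the function name is made up for
illustration):

  #include <cstdint>
  #include <cstdio>

  // True if the smallest magic m satisfying the round-up criterion for
  // 32-bit unsigned division by d does not fit in 32 bits (the IsAdd case).
  bool needs33BitMagic(uint32_t d) {
    for (int s = 0; s <= 32; ++s) {
      unsigned __int128 p = (unsigned __int128)1 << (32 + s);
      unsigned __int128 m = (p + d - 1) / d;            // ceil(2^(32+s) / d)
      if (m * d - p <= ((unsigned __int128)1 << s))
        return (m >> 32) != 0;
    }
    return false; // not reached for d >= 2
  }

  int main() {
    const uint32_t ds[] = {3, 5, 7, 19, 21, 27, 31};
    for (uint32_t d : ds)
      printf("%u -> %s magic\n", (unsigned)d,
             needs33BitMagic(d) ? "33-bit" : "32-bit");
  }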

Tests added for X86, AArch64, and RISC-V64 architectures.
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 104 ++++++++++---
 llvm/test/CodeGen/AArch64/rem-by-const.ll     |  27 ++--
 ...sve-streaming-mode-fixed-length-int-div.ll |  59 ++------
 .../AArch64/udiv-const-optimization.ll        |  61 ++++++++
 llvm/test/CodeGen/AArch64/urem-lkk.ll         |  47 +++---
 llvm/test/CodeGen/RISCV/div-by-constant.ll    |  34 ++---
 .../CodeGen/RISCV/udiv-const-optimization.ll  |  66 ++++++++
 llvm/test/CodeGen/RISCV/urem-lkk.ll           |  24 ++-
 llvm/test/CodeGen/X86/fold-loop-of-urem.ll    |  14 +-
 .../CodeGen/X86/udiv-const-optimization.ll    | 141 ++++++++++++++++++
 llvm/test/CodeGen/X86/urem-lkk.ll             |  31 ++--
 11 files changed, 449 insertions(+), 159 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/udiv-const-optimization.ll
 create mode 100644 llvm/test/CodeGen/RISCV/udiv-const-optimization.ll
 create mode 100644 llvm/test/CodeGen/X86/udiv-const-optimization.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e4b4d80896fa7..c3829002b5b36 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -12,6 +12,7 @@
 
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/Analysis.h"
@@ -6791,6 +6792,7 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
   const unsigned SVTBits = SVT.getSizeInBits();
 
   bool UseNPQ = false, UsePreShift = false, UsePostShift = false;
+  bool Use33BitOptimization = false;
   SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
 
   auto BuildUDIVPattern = [&](ConstantSDNode *C) {
@@ -6812,23 +6814,51 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
           UnsignedDivisionByConstantInfo::get(
               Divisor, std::min(KnownLeadingZeros, Divisor.countl_zero()));
 
-      MagicFactor = DAG.getConstant(magics.Magic.zext(SVTBits), dl, SVT);
-
-      assert(magics.PreShift < Divisor.getBitWidth() &&
-             "We shouldn't generate an undefined shift!");
-      assert(magics.PostShift < Divisor.getBitWidth() &&
-             "We shouldn't generate an undefined shift!");
-      assert((!magics.IsAdd || magics.PreShift == 0) &&
-             "Unexpected pre-shift");
-      PreShift = DAG.getConstant(magics.PreShift, dl, ShSVT);
-      PostShift = DAG.getConstant(magics.PostShift, dl, ShSVT);
-      NPQFactor = DAG.getConstant(
-          magics.IsAdd ? APInt::getOneBitSet(SVTBits, EltBits - 1)
-                       : APInt::getZero(SVTBits),
-          dl, SVT);
-      UseNPQ |= magics.IsAdd;
-      UsePreShift |= magics.PreShift != 0;
-      UsePostShift |= magics.PostShift != 0;
+      // Our Approach: For 32-bit division with IsAdd (33-bit
+      // magic case), use optimized method: preshift c by (64-a) bits to
+      // eliminate runtime shift. This requires 64x64->128 bit multiplication.
+      // Only apply to scalar types since SIMD lacks 64x64->128 high multiply.
+      // Note: IsAdd=true implies PreShift=0 by algorithm design.
+      // Check if 64-bit MULHU is available before applying this optimization.
+      EVT WideVT64 = EVT::getIntegerVT(*DAG.getContext(), 64);
+      bool Has64BitMULHU =
+          isOperationLegalOrCustom(ISD::MULHU, WideVT64, IsAfterLegalization) ||
+          isOperationLegalOrCustom(ISD::UMUL_LOHI, WideVT64,
+                                   IsAfterLegalization);
+      if (EltBits == 32 && !VT.isVector() && Has64BitMULHU && magics.IsAdd) {
+        // For IsAdd case, actual magic constant is 2^32 + Magic (33-bit)
+        unsigned OriginalShift = magics.PostShift + 33;
+        APInt RealMagic = APInt(65, 1).shl(32) + magics.Magic.zext(65); // 2^32 + Magic
+        Use33BitOptimization = true;
+        // Shift the constant left by (64 - OriginalShift) to avoid runtime shift
+        APInt ShiftedMagic = RealMagic.shl(64 - OriginalShift).trunc(64);
+        MagicFactor = DAG.getConstant(ShiftedMagic, dl,
+                                       EVT::getIntegerVT(*DAG.getContext(), 64));
+        PreShift = DAG.getConstant(0, dl, ShSVT);
+        PostShift = DAG.getConstant(0, dl, ShSVT);
+        NPQFactor = DAG.getConstant(APInt::getZero(SVTBits), dl, SVT);
+        UseNPQ = false;
+        UsePreShift = false;
+        UsePostShift = false;
+      } else {
+        MagicFactor = DAG.getConstant(magics.Magic.zext(SVTBits), dl, SVT);
+
+        assert(magics.PreShift < Divisor.getBitWidth() &&
+               "We shouldn't generate an undefined shift!");
+        assert(magics.PostShift < Divisor.getBitWidth() &&
+               "We shouldn't generate an undefined shift!");
+        assert((!magics.IsAdd || magics.PreShift == 0) &&
+               "Unexpected pre-shift");
+        PreShift = DAG.getConstant(magics.PreShift, dl, ShSVT);
+        PostShift = DAG.getConstant(magics.PostShift, dl, ShSVT);
+        NPQFactor = DAG.getConstant(
+            magics.IsAdd ? APInt::getOneBitSet(SVTBits, EltBits - 1)
+                         : APInt::getZero(SVTBits),
+            dl, SVT);
+        UseNPQ |= magics.IsAdd;
+        UsePreShift |= magics.PreShift != 0;
+        UsePostShift |= magics.PostShift != 0;
+      }
     }
 
     PreShifts.push_back(PreShift);
@@ -6864,6 +6894,46 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
     PostShift = PostShifts[0];
   }
 
+  // Our Approach: Use optimized 33-bit method for 32-bit division
+  if (Use33BitOptimization) {
+    // x is i32, MagicFactor is pre-shifted i64 constant
+    // Compute: (i64(x) * MagicFactor) >> 64
+    EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), 64);
+    SDValue X64 = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, N0);
+
+    // Get the pre-shifted constant (it's already in MagicFactor as i64)
+    SDValue MagicFactor64 = isa<ConstantSDNode>(MagicFactor)
+        ? MagicFactor
+        : MagicFactors[0];
+
+    SDValue Result;
+    // Perform 64x64 -> 128 multiplication and extract high 64 bits
+    if (isOperationLegalOrCustom(ISD::MULHU, WideVT, IsAfterLegalization)) {
+      SDValue High = DAG.getNode(ISD::MULHU, dl, WideVT, X64, MagicFactor64);
+      Created.push_back(High.getNode());
+      // Truncate back to i32
+      Result = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
+    } else if (isOperationLegalOrCustom(ISD::UMUL_LOHI, WideVT, IsAfterLegalization)) {
+      SDValue LoHi = DAG.getNode(ISD::UMUL_LOHI, dl,
+                                  DAG.getVTList(WideVT, WideVT),
+                                  X64, MagicFactor64);
+      SDValue High = SDValue(LoHi.getNode(), 1);
+      Created.push_back(LoHi.getNode());
+      Result = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
+    } else {
+      // Fallback to standard path if 64-bit MULHU is not available
+      Use33BitOptimization = false;
+      goto standard_path;
+    }
+
+    // Handle divisor == 1 case with SELECT
+    EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+    SDValue One = DAG.getConstant(1, dl, VT);
+    SDValue IsOne = DAG.getSetCC(dl, SetCCVT, N1, One, ISD::SETEQ);
+    return DAG.getSelect(dl, VT, IsOne, N0, Result);
+  }
+
+standard_path:
   SDValue Q = N0;
   if (UsePreShift) {
     Q = DAG.getNode(ISD::SRL, dl, VT, Q, PreShift);
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index 927d46612f443..c1f745c2f42de 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -306,13 +306,11 @@ entry:
 define i32 @ui32_7(i32 %a, i32 %b) {
 ; CHECK-SD-LABEL: ui32_7:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
-; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
-; CHECK-SD-NEXT:    umull x8, w0, w8
-; CHECK-SD-NEXT:    lsr x8, x8, #32
-; CHECK-SD-NEXT:    sub w9, w0, w8
-; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
-; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    mov x8, #2684354560 // =0xa0000000
+; CHECK-SD-NEXT:    mov w9, w0
+; CHECK-SD-NEXT:    movk x8, #18724, lsl #32
+; CHECK-SD-NEXT:    movk x8, #9362, lsl #48
+; CHECK-SD-NEXT:    umulh x8, x9, x8
 ; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
 ; CHECK-SD-NEXT:    add w0, w0, w8
 ; CHECK-SD-NEXT:    ret
@@ -2558,20 +2556,19 @@ define <3 x i32> @uv3i32_7(<3 x i32> %d, <3 x i32> %e) {
 ; CHECK-SD-LABEL: uv3i32_7:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    mov w8, #18725 // =0x4925
-; CHECK-SD-NEXT:    mov w9, v0.s[2]
+; CHECK-SD-NEXT:    mov x9, #2684354560 // =0xa0000000
 ; CHECK-SD-NEXT:    movk w8, #9362, lsl #16
+; CHECK-SD-NEXT:    movk x9, #18724, lsl #32
 ; CHECK-SD-NEXT:    dup v1.2s, w8
-; CHECK-SD-NEXT:    umull x8, w9, w8
+; CHECK-SD-NEXT:    mov w8, v0.s[2]
+; CHECK-SD-NEXT:    movk x9, #9362, lsl #48
 ; CHECK-SD-NEXT:    umull v1.2d, v0.2s, v1.2s
-; CHECK-SD-NEXT:    lsr x8, x8, #32
-; CHECK-SD-NEXT:    sub w10, w9, w8
+; CHECK-SD-NEXT:    umulh x9, x8, x9
 ; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #32
-; CHECK-SD-NEXT:    add w8, w8, w10, lsr #1
-; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    sub w9, w9, w9, lsl #3
 ; CHECK-SD-NEXT:    sub v2.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
+; CHECK-SD:         add w8, w8, w9
 ; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
-; CHECK-SD-NEXT:    add w8, w9, w8
 ; CHECK-SD-NEXT:    shrn v2.2s, v2.2d, #1
 ; CHECK-SD-NEXT:    add v1.2s, v2.2s, v1.2s
 ; CHECK-SD-NEXT:    movi v2.2s, #7
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index d22274e8312ca..21ff07b8bc008 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -1791,61 +1791,30 @@ define void @udiv_constantsplat_v8i32(ptr %a)  {
 ; NONEON-NOSVE-LABEL: udiv_constantsplat_v8i32:
 ; NONEON-NOSVE:       // %bb.0:
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT:    mov w8, #8969 // =0x2309
-; NONEON-NOSVE-NEXT:    movk w8, #22765, lsl #16
+; NONEON-NOSVE-NEXT:    mov x8, #301989888 // =0x12000000
+; NONEON-NOSVE-NEXT:    movk x8, #55878, lsl #32
+; NONEON-NOSVE-NEXT:    movk x8, #689, lsl #48
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
 ; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
-; NONEON-NOSVE-NEXT:    umull x10, w9, w8
-; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
-; NONEON-NOSVE-NEXT:    sub w9, w9, w10
-; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT:    lsr w11, w9, #6
+; NONEON-NOSVE-NEXT:    umulh x10, x9, x8
 ; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
-; NONEON-NOSVE-NEXT:    umull x10, w9, w8
-; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
-; NONEON-NOSVE-NEXT:    sub w9, w9, w10
-; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT:    lsr w9, w9, #6
-; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #56]
+; NONEON-NOSVE-NEXT:    umulh x9, x9, x8
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
 ; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
-; NONEON-NOSVE-NEXT:    umull x10, w9, w8
-; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
-; NONEON-NOSVE-NEXT:    sub w9, w9, w10
-; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT:    lsr w11, w9, #6
+; NONEON-NOSVE-NEXT:    umulh x10, x9, x8
 ; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
-; NONEON-NOSVE-NEXT:    umull x10, w9, w8
-; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
-; NONEON-NOSVE-NEXT:    sub w9, w9, w10
-; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT:    lsr w9, w9, #6
-; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #48]
+; NONEON-NOSVE-NEXT:    umulh x9, x9, x8
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
 ; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
-; NONEON-NOSVE-NEXT:    umull x10, w9, w8
-; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
-; NONEON-NOSVE-NEXT:    sub w9, w9, w10
-; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT:    lsr w11, w9, #6
+; NONEON-NOSVE-NEXT:    umulh x10, x9, x8
 ; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
-; NONEON-NOSVE-NEXT:    umull x10, w9, w8
-; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
-; NONEON-NOSVE-NEXT:    sub w9, w9, w10
-; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT:    lsr w9, w9, #6
-; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    umulh x9, x9, x8
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #40]
 ; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
-; NONEON-NOSVE-NEXT:    umull x10, w9, w8
-; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
-; NONEON-NOSVE-NEXT:    sub w9, w9, w10
-; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT:    lsr w10, w9, #6
+; NONEON-NOSVE-NEXT:    umulh x10, x9, x8
 ; NONEON-NOSVE-NEXT:    ldr w9, [sp]
-; NONEON-NOSVE-NEXT:    umull x8, w9, w8
-; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
-; NONEON-NOSVE-NEXT:    sub w9, w9, w8
-; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #1
-; NONEON-NOSVE-NEXT:    lsr w8, w8, #6
+; NONEON-NOSVE-NEXT:    umulh x8, x9, x8
 ; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
diff --git a/llvm/test/CodeGen/AArch64/udiv-const-optimization.ll b/llvm/test/CodeGen/AArch64/udiv-const-optimization.ll
new file mode 100644
index 0000000000000..d1282d376f0aa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/udiv-const-optimization.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s
+
+; Test optimization of 32-bit unsigned division by constants with 33-bit magic
+; constants (IsAdd=true) on AArch64. The optimization uses the umulh instruction.
+
+define i32 @udiv_by_7(i32 %x) nounwind {
+; CHECK-LABEL: udiv_by_7:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #2684354560 // =0xa0000000
+; CHECK-NEXT:    mov w9, w0
+; CHECK-NEXT:    movk x8, #18724, lsl #32
+; CHECK-NEXT:    movk x8, #9362, lsl #48
+; CHECK-NEXT:    umulh x0, x9, x8
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %div = udiv i32 %x, 7
+  ret i32 %div
+}
+
+define i32 @udiv_by_19(i32 %x) nounwind {
+; CHECK-LABEL: udiv_by_19:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #1476395008 // =0x58000000
+; CHECK-NEXT:    mov w9, w0
+; CHECK-NEXT:    movk x8, #17246, lsl #32
+; CHECK-NEXT:    movk x8, #3449, lsl #48
+; CHECK-NEXT:    umulh x0, x9, x8
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %div = udiv i32 %x, 19
+  ret i32 %div
+}
+
+define i32 @udiv_by_21(i32 %x) nounwind {
+; CHECK-LABEL: udiv_by_21:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #939524096 // =0x38000000
+; CHECK-NEXT:    mov w9, w0
+; CHECK-NEXT:    movk x8, #49932, lsl #32
+; CHECK-NEXT:    movk x8, #3120, lsl #48
+; CHECK-NEXT:    umulh x0, x9, x8
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %div = udiv i32 %x, 21
+  ret i32 %div
+}
+
+; Test non-optimized case
+define i32 @udiv_by_3(i32 %x) nounwind {
+; CHECK-LABEL: udiv_by_3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #43691 // =0xaaab
+; CHECK-NEXT:    movk w8, #43690, lsl #16
+; CHECK-NEXT:    umull x8, w0, w8
+; CHECK-NEXT:    lsr x0, x8, #33
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %div = udiv i32 %x, 3
+  ret i32 %div
+}
diff --git a/llvm/test/CodeGen/AArch64/urem-lkk.ll b/llvm/test/CodeGen/AArch64/urem-lkk.ll
index 40016c7e4ce0f..c09924b1b430f 100644
--- a/llvm/test/CodeGen/AArch64/urem-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/urem-lkk.ll
@@ -3,18 +3,29 @@
 ; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define i32 @fold_urem_positive_odd(i32 %x) {
-; CHECK-LABEL: fold_urem_positive_odd:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #8969 // =0x2309
-; CHECK-NEXT:    movk w8, #22765, lsl #16
-; CHECK-NEXT:    umull x8, w0, w8
-; CHECK-NEXT:    lsr x8, x8, #32
-; CHECK-NEXT:    sub w9, w0, w8
-; CHECK-NEXT:    add w8, w8, w9, lsr #1
-; CHECK-NEXT:    mov w9, #95 // =0x5f
-; CHECK-NEXT:    lsr w8, w8, #6
-; CHECK-NEXT:    msub w0, w8, w9, w0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fold_urem_positive_odd:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov x8, #301989888 // =0x12000000
+; CHECK-SD-NEXT:    mov w9, w0
+; CHECK-SD-NEXT:    movk x8, #55878, lsl #32
+; CHECK-SD-NEXT:    movk x8, #689, lsl #48
+; CHECK-SD-NEXT:    umulh x8, x9, x8
+; CHECK-SD-NEXT:    mov w9, #95 // =0x5f
+; CHECK-SD-NEXT:    msub w0, w8, w9, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fold_urem_positive_odd:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #8969 // =0x2309
+; CHECK-GI-NEXT:    movk w8, #22765, lsl #16
+; CHECK-GI-NEXT:    umull x8, w0, w8
+; CHECK-GI-NEXT:    lsr x8, x8, #32
+; CHECK-GI-NEXT:    sub w9, w0, w8
+; CHECK-GI-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-GI-NEXT:    mov w9, #95 // =0x5f
+; CHECK-GI-NEXT:    lsr w8, w8, #6
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
+; CHECK-GI-NEXT:    ret
   %1 = urem i32 %x, 95
   ret i32 %1
 }
@@ -37,14 +48,12 @@ define i32 @fold_urem_positive_even(i32 %x) {
 define i32 @combine_urem_udiv(i32 %x) {
 ; CHECK-SD-LABEL: combine_urem_udiv:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    mov w8, #8969 // =0x2309
-; CHECK-SD-NEXT:    movk w8, #22765, lsl #16
-; CHECK-SD-NEXT:    umull x8, w0, w8
-; CHECK-SD-NEXT:    lsr x8, x8, #32
-; CHECK-SD-NEXT:    sub w9, w0, w8
-; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
+; CHECK-SD-NEXT:    mov x8, #301989888 // =0x12000000
+; CHECK-SD-NEXT:    mov w9, w0
+; CHECK-SD-NEXT:    movk x8, #55878, lsl #32
+; CHECK-SD-NEXT:    movk x8, #689, lsl #48
+; CHECK-SD-NEXT:    umulh x8, x9, x8
 ; CHECK-SD-NEXT:    mov w9, #95 // =0x5f
-; CHECK-SD-NEXT:    lsr w8, w8, #6
 ; CHECK-SD-NEXT:    msub w9, w8, w9, w0
 ; CHECK-SD-NEXT:    add w0, w9, w8
 ; CHECK-SD-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index 53c3f5841ba0f..24c882daa113d 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -48,29 +48,25 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
 ;
 ; RV64IM-LABEL: udiv_constant_add:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    slli a1, a0, 32
-; RV64IM-NEXT:    lui a2, 149797
-; RV64IM-NEXT:    addi a2, a2, -1755
-; RV64IM-NEXT:    slli a2, a2, 32
-; RV64IM-NEXT:    mulhu a1, a1, a2
-; RV64IM-NEXT:    srli a1, a1, 32
-; RV64IM-NEXT:    sub a0, a0, a1
-; RV64IM-NEXT:    srliw a0, a0, 1
-; RV64IM-NEXT:    add a0, a0, a1
-; RV64IM-NEXT:    srli a0, a0, 2
+; RV64IM-NEXT:    slli a0, a0, 32
+; RV64IM-NEXT:    lui a1, 293
+; RV64IM-NEXT:    srli a0, a0, 32
+; RV64IM-NEXT:    addi a1, a1, -1755
+; RV64IM-NEXT:    slli a1, a1, 12
+; RV64IM-NEXT:    addi a1, a1, -1755
+; RV64IM-NEXT:    slli a1, a1, 29
+; RV64IM-NEXT:    mulhu a0, a0, a1
 ; RV64IM-NEXT:    ret
 ;
 ; RV64IMZB-LABEL: udiv_constant_add:
 ; RV64IMZB:       # %bb.0:
-; RV64IMZB-NEXT:    zext.w a1, a0
-; RV64IMZB-NEXT:    lui a2, 149797
-; RV64IMZB-NEXT:    addi a2, a2, -1755
-; RV64IMZB-NEXT:    mul a1, a1, a2
-; RV64IMZB-NEXT:    srli a1, a1, 32
-; RV64IMZB-NEXT:    sub a0, a0, a1
-; RV64IMZB-NEXT:    srliw a0, a0, 1
-; RV64IMZB-NEXT:    add a0, a0, a1
-; RV64IMZB-NEXT:    srli a0, a0, 2
+; RV64IMZB-NEXT:    zext.w a0, a0
+; RV64IMZB-NEXT:    lui a1, 293
+; RV64IMZB-NEXT:    addi a1, a1, -1755
+; RV64IMZB-NEXT:    slli a1, a1, 12
+; RV64IMZB-NEXT:    addi a1, a1, -1755
+; RV64IMZB-NEXT:    slli a1, a1, 29
+; RV64IMZB-NEXT:    mulhu a0, a0, a1
 ; RV64IMZB-NEXT:    ret
   %1 = udiv i32 %a, 7
   ret i32 %1
diff --git a/llvm/test/CodeGen/RISCV/udiv-const-optimization.ll b/llvm/test/CodeGen/RISCV/udiv-const-optimization.ll
new file mode 100644
index 0000000000000..5485a5b230a27
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/udiv-const-optimization.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv64-unknown-linux-gnu -mattr=+m | FileCheck %s
+
+; Test optimization of 32-bit unsigned division by constants with 33-bit magic
+; constants (IsAdd=true) on RISC-V64. The optimization uses the mulhu instruction.
+
+define i32 @udiv_by_7(i32 %x) nounwind {
+; CHECK-LABEL: udiv_by_7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 32
+; CHECK-NEXT:    lui a1, 293
+; CHECK-NEXT:    srli a0, a0, 32
+; CHECK-NEXT:    addi a1, a1, -1755
+; CHECK-NEXT:    slli a1, a1, 12
+; CHECK-NEXT:    addi a1, a1, -1755
+; CHECK-NEXT:    slli a1, a1, 29
+; CHECK-NEXT:    mulhu a0, a0, a1
+; CHECK-NEXT:    ret
+  %div = udiv i32 %x, 7
+  ret i32 %div
+}
+
+define i32 @udiv_by_19(i32 %x) nounwind {
+; CHECK-LABEL: udiv_by_19:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 32
+; CHECK-NEXT:    lui a1, 717447
+; CHECK-NEXT:    srli a0, a0, 32
+; CHECK-NEXT:    addi a1, a1, -1077
+; CHECK-NEXT:    slli a1, a1, 31
+; CHECK-NEXT:    srli a1, a1, 4
+; CHECK-NEXT:    mulhu a0, a0, a1
+; CHECK-NEXT:    ret
+  %div = udiv i32 %x, 19
+  ret i32 %div
+}
+
+define i32 @udiv_by_21(i32 %x) nounwind {
+; CHECK-LABEL: udiv_by_21:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 32
+; CHECK-NEXT:    lui a1, 549254
+; CHECK-NEXT:    srli a0, a0, 32
+; CHECK-NEXT:    addi a1, a1, 391
+; CHECK-NEXT:    slli a1, a1, 31
+; CHECK-NEXT:    srli a1, a1, 4
+; CHECK-NEXT:    mulhu a0, a0, a1
+; CHECK-NEXT:    ret
+  %div = udiv i32 %x, 21
+  ret i32 %div
+}
+
+; Test non-optimized case
+define i32 @udiv_by_3(i32 %x) nounwind {
+; CHECK-LABEL: udiv_by_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 32
+; CHECK-NEXT:    lui a1, 699051
+; CHECK-NEXT:    addi a1, a1, -1365
+; CHECK-NEXT:    slli a1, a1, 32
+; CHECK-NEXT:    mulhu a0, a0, a1
+; CHECK-NEXT:    srli a0, a0, 33
+; CHECK-NEXT:    ret
+  %div = udiv i32 %x, 3
+  ret i32 %div
+}
diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll
index ee496123ba7b4..449e56c82e74c 100644
--- a/llvm/test/CodeGen/RISCV/urem-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll
@@ -43,15 +43,13 @@ define i32 @fold_urem_positive_odd(i32 %x) nounwind {
 ; RV64IM-LABEL: fold_urem_positive_odd:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a0, 32
-; RV64IM-NEXT:    lui a2, 364242
+; RV64IM-NEXT:    lui a2, 172
+; RV64IM-NEXT:    srli a1, a1, 32
+; RV64IM-NEXT:    addi a2, a2, 1897
+; RV64IM-NEXT:    slli a2, a2, 13
 ; RV64IM-NEXT:    addi a2, a2, 777
-; RV64IM-NEXT:    slli a2, a2, 32
+; RV64IM-NEXT:    slli a2, a2, 25
 ; RV64IM-NEXT:    mulhu a1, a1, a2
-; RV64IM-NEXT:    srli a1, a1, 32
-; RV64IM-NEXT:    sub a2, a0, a1
-; RV64IM-NEXT:    srliw a2, a2, 1
-; RV64IM-NEXT:    add a1, a2, a1
-; RV64IM-NEXT:    srli a1, a1, 6
 ; RV64IM-NEXT:    li a2, 95
 ; RV64IM-NEXT:    mul a1, a1, a2
 ; RV64IM-NEXT:    subw a0, a0, a1
@@ -169,16 +167,14 @@ define i32 @combine_urem_udiv(i32 %x) nounwind {
 ; RV64IM-LABEL: combine_urem_udiv:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a0, 32
-; RV64IM-NEXT:    lui a2, 364242
+; RV64IM-NEXT:    lui a2, 172
+; RV64IM-NEXT:    srli a1, a1, 32
+; RV64IM-NEXT:    addi a2, a2, 1897
+; RV64IM-NEXT:    slli a2, a2, 13
 ; RV64IM-NEXT:    addi a2, a2, 777
-; RV64IM-NEXT:    slli a2, a2, 32
+; RV64IM-NEXT:    slli a2, a2, 25
 ; RV64IM-NEXT:    mulhu a1, a1, a2
-; RV64IM-NEXT:    srli a1, a1, 32
-; RV64IM-NEXT:    sub a2, a0, a1
-; RV64IM-NEXT:    srliw a2, a2, 1
-; RV64IM-NEXT:    add a1, a2, a1
 ; RV64IM-NEXT:    li a2, 95
-; RV64IM-NEXT:    srli a1, a1, 6
 ; RV64IM-NEXT:    mul a2, a1, a2
 ; RV64IM-NEXT:    add a0, a0, a1
 ; RV64IM-NEXT:    subw a0, a0, a2
diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
index c9c88f7258435..cb1c078ee5129 100644
--- a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
+++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
@@ -798,20 +798,14 @@ define void @simple_urem_skip_const_rem_amt(i32 %N) nounwind {
 ; CHECK-NEXT:    movl %edi, %ebx
 ; CHECK-NEXT:    addl $-4, %ebx
 ; CHECK-NEXT:    movl $4, %ebp
-; CHECK-NEXT:    movl $2938661835, %r14d # imm = 0xAF286BCB
+; CHECK-NEXT:    movabsq $970881267157434368, %r14 # imm = 0xD79435E58000000
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB13_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    movl %ebp, %eax
-; CHECK-NEXT:    imulq %r14, %rax
-; CHECK-NEXT:    shrq $32, %rax
-; CHECK-NEXT:    movl %ebp, %ecx
-; CHECK-NEXT:    subl %eax, %ecx
-; CHECK-NEXT:    shrl %ecx
-; CHECK-NEXT:    addl %eax, %ecx
-; CHECK-NEXT:    shrl $4, %ecx
-; CHECK-NEXT:    leal (%rcx,%rcx,8), %eax
-; CHECK-NEXT:    leal (%rcx,%rax,2), %eax
+; CHECK-NEXT:    mulq %r14
+; CHECK-NEXT:    leal (%rdx,%rdx,8), %eax
+; CHECK-NEXT:    leal (%rdx,%rax,2), %eax
 ; CHECK-NEXT:    movl %ebp, %edi
 ; CHECK-NEXT:    subl %eax, %edi
 ; CHECK-NEXT:    callq use.i32 at PLT
diff --git a/llvm/test/CodeGen/X86/udiv-const-optimization.ll b/llvm/test/CodeGen/X86/udiv-const-optimization.ll
new file mode 100644
index 0000000000000..a4fa413bab038
--- /dev/null
+++ b/llvm/test/CodeGen/X86/udiv-const-optimization.ll
@@ -0,0 +1,141 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi2 | FileCheck %s --check-prefix=X64-BMI2
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s --check-prefix=X86
+
+; Test optimization of 32-bit unsigned division by constants with 33-bit magic
+; constants (IsAdd=true) on 64-bit targets. The optimization uses pre-shifted
+; constants and 64x64->128 bit multiplication to reduce instruction count.
+
+define i32 @udiv_by_7(i32 %x) nounwind {
+; X64-LABEL: udiv_by_7:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movabsq $2635249153617166336, %rcx # imm = 0x24924924A0000000
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-NEXT:    retq
+;
+; X64-BMI2-LABEL: udiv_by_7:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movl %edi, %edx
+; X64-BMI2-NEXT:    movabsq $2635249153617166336, %rax # imm = 0x24924924A0000000
+; X64-BMI2-NEXT:    mulxq %rax, %rax, %rax
+; X64-BMI2-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-BMI2-NEXT:    retq
+;
+; X86-LABEL: udiv_by_7:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $613566757, %edx # imm = 0x24924925
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edx
+; X86-NEXT:    subl %edx, %ecx
+; X86-NEXT:    shrl %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    shrl $2, %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    retl
+  %div = udiv i32 %x, 7
+  ret i32 %div
+}
+
+define i32 @udiv_by_19(i32 %x) nounwind {
+; X64-LABEL: udiv_by_19:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movabsq $970881267157434368, %rcx # imm = 0xD79435E58000000
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-NEXT:    retq
+;
+; X64-BMI2-LABEL: udiv_by_19:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movl %edi, %edx
+; X64-BMI2-NEXT:    movabsq $970881267157434368, %rax # imm = 0xD79435E58000000
+; X64-BMI2-NEXT:    mulxq %rax, %rax, %rax
+; X64-BMI2-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-BMI2-NEXT:    retq
+;
+; X86-LABEL: udiv_by_19:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl ${{-?[0-9]+}}, %edx # imm = 0xAF286BCB
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edx
+; X86-NEXT:    subl %edx, %ecx
+; X86-NEXT:    shrl %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    shrl $4, %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    retl
+  %div = udiv i32 %x, 19
+  ret i32 %div
+}
+
+define i32 @udiv_by_21(i32 %x) nounwind {
+; X64-LABEL: udiv_by_21:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movabsq $878416384583794688, %rcx # imm = 0xC30C30C38000000
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-NEXT:    retq
+;
+; X64-BMI2-LABEL: udiv_by_21:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movl %edi, %edx
+; X64-BMI2-NEXT:    movabsq $878416384583794688, %rax # imm = 0xC30C30C38000000
+; X64-BMI2-NEXT:    mulxq %rax, %rax, %rax
+; X64-BMI2-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-BMI2-NEXT:    retq
+;
+; X86-LABEL: udiv_by_21:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl ${{-?[0-9]+}}, %edx # imm = 0x86186187
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edx
+; X86-NEXT:    subl %edx, %ecx
+; X86-NEXT:    shrl %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    shrl $4, %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    retl
+  %div = udiv i32 %x, 21
+  ret i32 %div
+}
+
+; Test non-optimized case: divisor that doesn't use IsAdd
+define i32 @udiv_by_3(i32 %x) nounwind {
+; X64-LABEL: udiv_by_3:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    movl $2863311531, %eax # imm = 0xAAAAAAAB
+; X64-NEXT:    imulq %rcx, %rax
+; X64-NEXT:    shrq $33, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-NEXT:    retq
+;
+; X64-BMI2-LABEL: udiv_by_3:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movl %edi, %ecx
+; X64-BMI2-NEXT:    movl $2863311531, %eax # imm = 0xAAAAAAAB
+; X64-BMI2-NEXT:    imulq %rcx, %rax
+; X64-BMI2-NEXT:    shrq $33, %rax
+; X64-BMI2-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-BMI2-NEXT:    retq
+;
+; X86-LABEL: udiv_by_3:
+; X86:       # %bb.0:
+; X86-NEXT:    movl ${{-?[0-9]+}}, %eax # imm = 0xAAAAAAAB
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    retl
+  %div = udiv i32 %x, 3
+  ret i32 %div
+}
diff --git a/llvm/test/CodeGen/X86/urem-lkk.ll b/llvm/test/CodeGen/X86/urem-lkk.ll
index 573f875544cd4..4ac12eae0e5b9 100644
--- a/llvm/test/CodeGen/X86/urem-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-lkk.ll
@@ -5,16 +5,11 @@ define i32 @fold_urem_positive_odd(i32 %x) {
 ; CHECK-LABEL: fold_urem_positive_odd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    movl %edi, %ecx
-; CHECK-NEXT:    imulq $1491936009, %rcx, %rcx # imm = 0x58ED2309
-; CHECK-NEXT:    shrq $32, %rcx
-; CHECK-NEXT:    movl %edi, %edx
-; CHECK-NEXT:    subl %ecx, %edx
-; CHECK-NEXT:    shrl %edx
-; CHECK-NEXT:    addl %ecx, %edx
-; CHECK-NEXT:    shrl $6, %edx
-; CHECK-NEXT:    imull $95, %edx, %ecx
-; CHECK-NEXT:    subl %ecx, %eax
+; CHECK-NEXT:    movabsq $194176253438197760, %rcx # imm = 0x2B1DA4612000000
+; CHECK-NEXT:    mulq %rcx
+; CHECK-NEXT:    imull $95, %edx, %eax
+; CHECK-NEXT:    subl %eax, %edi
+; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    retq
   %1 = urem i32 %x, 95
   ret i32 %1
@@ -41,17 +36,13 @@ define i32 @fold_urem_positive_even(i32 %x) {
 define i32 @combine_urem_udiv(i32 %x) {
 ; CHECK-LABEL: combine_urem_udiv:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    imulq $1491936009, %rax, %rcx # imm = 0x58ED2309
-; CHECK-NEXT:    shrq $32, %rcx
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    subl %ecx, %eax
-; CHECK-NEXT:    shrl %eax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    shrl $6, %eax
-; CHECK-NEXT:    imull $95, %eax, %ecx
-; CHECK-NEXT:    subl %ecx, %edi
-; CHECK-NEXT:    addl %edi, %eax
+; CHECK-NEXT:    movabsq $194176253438197760, %rcx # imm = 0x2B1DA4612000000
+; CHECK-NEXT:    mulq %rcx
+; CHECK-NEXT:    imull $95, %edx, %eax
+; CHECK-NEXT:    subl %eax, %edi
+; CHECK-NEXT:    leal (%rdi,%rdx), %eax
 ; CHECK-NEXT:    retq
   %1 = urem i32 %x, 95
   %2 = udiv i32 %x, 95

>From 6ecca57678053b1c51a715ee93603142a524d26d Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi at nifty.com>
Date: Fri, 13 Feb 2026 12:51:09 +0900
Subject: [PATCH 2/2] Apply review suggestions from llvm/llvm-project#181288

---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 34 +++++++------------
 1 file changed, 12 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index c3829002b5b36..2d149a10c3036 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -12,7 +12,6 @@
 
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/Analysis.h"
@@ -6792,6 +6791,9 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
   const unsigned SVTBits = SVT.getSizeInBits();
 
   bool UseNPQ = false, UsePreShift = false, UsePostShift = false;
+  EVT WideVT64 = EVT::getIntegerVT(*DAG.getContext(), 64);
+  bool HasWideVT64MULHU = isOperationLegalOrCustom(ISD::MULHU, WideVT64, IsAfterLegalization);
+  bool HasWideVT64UMUL_LOHI = isOperationLegalOrCustom(ISD::UMUL_LOHI, WideVT64, IsAfterLegalization);
   bool Use33BitOptimization = false;
   SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
 
@@ -6814,18 +6816,13 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
           UnsignedDivisionByConstantInfo::get(
               Divisor, std::min(KnownLeadingZeros, Divisor.countl_zero()));
 
-      // Our Approach: For 32-bit division with IsAdd (33-bit
-      // magic case), use optimized method: preshift c by (64-a) bits to
-      // eliminate runtime shift. This requires 64x64->128 bit multiplication.
+      // For 32-bit division with IsAdd (33-bit magic case), use optimized method:
+      // preshift c by (64-a) bits to eliminate runtime shift.
+      // This requires 64x64->128 bit multiplication.
       // Only apply to scalar types since SIMD lacks 64x64->128 high multiply.
       // Note: IsAdd=true implies PreShift=0 by algorithm design.
       // Check if 64-bit MULHU is available before applying this optimization.
-      EVT WideVT64 = EVT::getIntegerVT(*DAG.getContext(), 64);
-      bool Has64BitMULHU =
-          isOperationLegalOrCustom(ISD::MULHU, WideVT64, IsAfterLegalization) ||
-          isOperationLegalOrCustom(ISD::UMUL_LOHI, WideVT64,
-                                   IsAfterLegalization);
-      if (EltBits == 32 && !VT.isVector() && Has64BitMULHU && magics.IsAdd) {
+        if (EltBits == 32 && !VT.isVector() && (HasWideVT64MULHU || HasWideVT64UMUL_LOHI) && magics.IsAdd) {
         // For IsAdd case, actual magic constant is 2^32 + Magic (33-bit)
         unsigned OriginalShift = magics.PostShift + 33;
         APInt RealMagic = APInt(65, 1).shl(32) + magics.Magic.zext(65); // 2^32 + Magic
@@ -6894,12 +6891,10 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
     PostShift = PostShifts[0];
   }
 
-  // Our Approach: Use optimized 33-bit method for 32-bit division
   if (Use33BitOptimization) {
     // x is i32, MagicFactor is pre-shifted i64 constant
     // Compute: (i64(x) * MagicFactor) >> 64
-    EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), 64);
-    SDValue X64 = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, N0);
+    SDValue X64 = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT64, N0);
 
     // Get the pre-shifted constant (it's already in MagicFactor as i64)
     SDValue MagicFactor64 = isa<ConstantSDNode>(MagicFactor)
@@ -6908,22 +6903,18 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
 
     SDValue Result;
     // Perform 64x64 -> 128 multiplication and extract high 64 bits
-    if (isOperationLegalOrCustom(ISD::MULHU, WideVT, IsAfterLegalization)) {
-      SDValue High = DAG.getNode(ISD::MULHU, dl, WideVT, X64, MagicFactor64);
+    if (HasWideVT64MULHU) {
+      SDValue High = DAG.getNode(ISD::MULHU, dl, WideVT64, X64, MagicFactor64);
       Created.push_back(High.getNode());
       // Truncate back to i32
       Result = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
-    } else if (isOperationLegalOrCustom(ISD::UMUL_LOHI, WideVT, IsAfterLegalization)) {
+    } else if (HasWideVT64UMUL_LOHI) {
       SDValue LoHi = DAG.getNode(ISD::UMUL_LOHI, dl,
-                                  DAG.getVTList(WideVT, WideVT),
+                                  DAG.getVTList(WideVT64, WideVT64),
                                   X64, MagicFactor64);
       SDValue High = SDValue(LoHi.getNode(), 1);
       Created.push_back(LoHi.getNode());
       Result = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
-    } else {
-      // Fallback to standard path if 64-bit MULHU is not available
-      Use33BitOptimization = false;
-      goto standard_path;
     }
 
     // Handle divisor == 1 case with SELECT
@@ -6933,7 +6924,6 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
     return DAG.getSelect(dl, VT, IsOne, N0, Result);
   }
 
-standard_path:
   SDValue Q = N0;
   if (UsePreShift) {
     Q = DAG.getNode(ISD::SRL, dl, VT, Q, PreShift);


