[llvm] [SelectionDAG] Optimize 32-bit udiv with 33-bit magic constants on 64-bit targets (PR #181288)
MITSUNARI Shigeo via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 12 21:16:01 PST 2026
https://github.com/herumi updated https://github.com/llvm/llvm-project/pull/181288
>From ae3673be45d1c9f4e1864b897238ae099f73eaac Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi at nifty.com>
Date: Tue, 10 Feb 2026 16:01:52 +0900
Subject: [PATCH 1/2] [SelectionDAG] Optimize 32-bit udiv with 33-bit magic
constants on 64-bit targets
Compiler lowering of unsigned 32-bit division by a constant (such as x / 7)
is based on the Granlund-Montgomery method (1994). However, the sequence generated for the
IsAdd=true case (33-bit magic constants) is tuned for 32-bit CPUs, not 64-bit CPUs.
This patch adds a lowering specific to 64-bit CPUs (such as x86_64 and
Apple M-series): the 33-bit magic constant is pre-shifted left by (64 - a) bits,
where a = PostShift + 33 is the combined shift, and the quotient is taken directly from the
high 64 bits of a 64x64->128 bit multiplication. This eliminates
the add/sub/shift fix-up sequence required by the traditional approach.
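As a standalone sanity check of the arithmetic (not part of the patch; it assumes a
compiler with the GCC/Clang unsigned __int128 extension): for d = 7 the 33-bit magic is
2^32 + 0x24924925, the combined shift is a = PostShift + 33 = 35, and the pre-shifted
constant is (2^32 + 0x24924925) << 29 = 0x24924924A0000000, the same immediate that
appears in the X86 tests below.

#include <cassert>
#include <cstdint>

static uint32_t udiv7_magic(uint32_t x) {
  // Pre-shifted 64-bit magic for d = 7: (2^32 + 0x24924925) << (64 - 35).
  const uint64_t M = 0x24924924A0000000ULL;
  // The quotient is the high 64 bits of the 64x64->128 product (umulh / mulq).
  return static_cast<uint32_t>((static_cast<unsigned __int128>(x) * M) >> 64);
}

int main() {
  // Sparse check across the whole 32-bit input range.
  for (uint64_t x = 0; x <= 0xFFFFFFFFULL; x += 9973)
    assert(udiv7_magic(static_cast<uint32_t>(x)) == static_cast<uint32_t>(x) / 7);
  return 0;
}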
Before (7 instructions for x / 7 on x86_64):
movl %edi, %eax
imulq $613566757, %rax, %rax
shrq $32, %rax
subl %eax, %edi
shrl %edi
addl %edi, %eax
shrl $2, %eax
After with BMI2 (3 instructions):
movl %edi, %edx
movabsq $2635249153617166336, %rax
mulxq %rax, %rax, %rax
Without BMI2 (4 instructions):
movl %edi, %eax
movabsq $2635249153617166336, %rcx
mulq %rcx
movq %rdx, %rax
Benchmark results (1 billion divisions by 7, 19, and 107):
| CPU | Original (sec) | Optimized (sec) | Time reduction |
|------------|----------------|-----------------|---------|
| Intel Xeon | 6.40 | 3.83 | 40.2% |
| Apple M4 | 6.70 | 3.38 | 49.6% |
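The exact benchmark harness is not part of the patch; the sketch below (hypothetical,
one loop-carried chain of divisions per divisor so the divide sequence stays on the
critical path) shows the kind of measurement being described.

#include <chrono>
#include <cstdint>
#include <cstdio>

template <uint32_t D> __attribute__((noinline)) uint32_t chainDiv(uint32_t n) {
  uint32_t x = 0xFFFFFFFFu;
  for (uint32_t i = 0; i < n; ++i)
    x = x / D + 12345u; // constant divisor, so the magic-multiply lowering is used
  return x;
}

int main() {
  const uint32_t N = 1000000000u; // 1 billion divisions per divisor
  auto t0 = std::chrono::steady_clock::now();
  uint32_t sink = chainDiv<7>(N) ^ chainDiv<19>(N) ^ chainDiv<107>(N);
  auto t1 = std::chrono::steady_clock::now();
  std::printf("sink=%u time=%.2fs\n", sink,
              std::chrono::duration<double>(t1 - t0).count());
  return 0;
}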
The optimization applies when:
- Element size is 32 bits
- Type is scalar (not vector), since SIMD typically lacks a 64x64->128 high-multiply instruction
- A 64-bit MULHU or UMUL_LOHI is legal or custom on the target (in practice, 64-bit targets)
- UnsignedDivisionByConstantInfo::IsAdd is true (33-bit magic)
Affected divisors: 7, 19, 21, 27, 31, 35, 37, etc. (about 23% of 31-bit divisors)
Architectures: x86_64, AArch64, RISC-V64 (any 64-bit target with 64x64->128 bit multiply)
Tests added for X86, AArch64, and RISC-V64 architectures.
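The affected-divisor list can be reproduced with a brute-force sketch of the textbook
Granlund-Montgomery selection rule. This is an illustration only; LLVM's actual logic
lives in UnsignedDivisionByConstantInfo (llvm/Support/DivisionByConstantInfo.cpp) and may
differ in corner cases, e.g. when known leading zeros are passed in.

#include <cstdint>
#include <cstdio>

// True if the unsigned magic for a 32-bit division by d needs 33 bits,
// i.e. the IsAdd fix-up path (and this patch's new lowering) would be used.
static bool needs33BitMagic(uint32_t d) {
  if (d < 3 || (d & (d - 1)) == 0) // 1, 2 and powers of two never need it
    return false;
  for (unsigned s = 0; s <= 32; ++s) {
    unsigned __int128 pow2 = static_cast<unsigned __int128>(1) << (32 + s);
    unsigned __int128 m = (pow2 + d - 1) / d;                     // ceil(2^(32+s) / d)
    if (m * d - pow2 <= (static_cast<unsigned __int128>(1) << s)) // error bound holds
      return (m >> 32) != 0;                                      // magic has 33 bits
  }
  return false;
}

int main() {
  for (uint32_t d = 3; d < 50; ++d)
    if (needs33BitMagic(d))
      std::printf("%u ", d); // expect 7 19 21 27 31 35 37 ...
  std::printf("\n");
  return 0;
}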
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 104 ++++++++++---
llvm/test/CodeGen/AArch64/rem-by-const.ll | 27 ++--
...sve-streaming-mode-fixed-length-int-div.ll | 59 ++------
.../AArch64/udiv-const-optimization.ll | 61 ++++++++
llvm/test/CodeGen/AArch64/urem-lkk.ll | 47 +++---
llvm/test/CodeGen/RISCV/div-by-constant.ll | 34 ++---
.../CodeGen/RISCV/udiv-const-optimization.ll | 66 ++++++++
llvm/test/CodeGen/RISCV/urem-lkk.ll | 24 ++-
llvm/test/CodeGen/X86/fold-loop-of-urem.ll | 14 +-
.../CodeGen/X86/udiv-const-optimization.ll | 141 ++++++++++++++++++
llvm/test/CodeGen/X86/urem-lkk.ll | 31 ++--
11 files changed, 449 insertions(+), 159 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/udiv-const-optimization.ll
create mode 100644 llvm/test/CodeGen/RISCV/udiv-const-optimization.ll
create mode 100644 llvm/test/CodeGen/X86/udiv-const-optimization.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e4b4d80896fa7..c3829002b5b36 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -12,6 +12,7 @@
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
@@ -6791,6 +6792,7 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
const unsigned SVTBits = SVT.getSizeInBits();
bool UseNPQ = false, UsePreShift = false, UsePostShift = false;
+ bool Use33BitOptimization = false;
SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
auto BuildUDIVPattern = [&](ConstantSDNode *C) {
@@ -6812,23 +6814,51 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
UnsignedDivisionByConstantInfo::get(
Divisor, std::min(KnownLeadingZeros, Divisor.countl_zero()));
- MagicFactor = DAG.getConstant(magics.Magic.zext(SVTBits), dl, SVT);
-
- assert(magics.PreShift < Divisor.getBitWidth() &&
- "We shouldn't generate an undefined shift!");
- assert(magics.PostShift < Divisor.getBitWidth() &&
- "We shouldn't generate an undefined shift!");
- assert((!magics.IsAdd || magics.PreShift == 0) &&
- "Unexpected pre-shift");
- PreShift = DAG.getConstant(magics.PreShift, dl, ShSVT);
- PostShift = DAG.getConstant(magics.PostShift, dl, ShSVT);
- NPQFactor = DAG.getConstant(
- magics.IsAdd ? APInt::getOneBitSet(SVTBits, EltBits - 1)
- : APInt::getZero(SVTBits),
- dl, SVT);
- UseNPQ |= magics.IsAdd;
- UsePreShift |= magics.PreShift != 0;
- UsePostShift |= magics.PostShift != 0;
+ // Our Approach: For 32-bit division with IsAdd (33-bit
+ // magic case), use optimized method: preshift c by (64-a) bits to
+ // eliminate runtime shift. This requires 64x64->128 bit multiplication.
+ // Only apply to scalar types since SIMD lacks 64x64->128 high multiply.
+ // Note: IsAdd=true implies PreShift=0 by algorithm design.
+ // Check if 64-bit MULHU is available before applying this optimization.
+ EVT WideVT64 = EVT::getIntegerVT(*DAG.getContext(), 64);
+ bool Has64BitMULHU =
+ isOperationLegalOrCustom(ISD::MULHU, WideVT64, IsAfterLegalization) ||
+ isOperationLegalOrCustom(ISD::UMUL_LOHI, WideVT64,
+ IsAfterLegalization);
+ if (EltBits == 32 && !VT.isVector() && Has64BitMULHU && magics.IsAdd) {
+ // For IsAdd case, actual magic constant is 2^32 + Magic (33-bit)
+ unsigned OriginalShift = magics.PostShift + 33;
+ APInt RealMagic = APInt(65, 1).shl(32) + magics.Magic.zext(65); // 2^32 + Magic
+ Use33BitOptimization = true;
+ // Shift the constant left by (64 - OriginalShift) to avoid runtime shift
+ APInt ShiftedMagic = RealMagic.shl(64 - OriginalShift).trunc(64);
+ MagicFactor = DAG.getConstant(ShiftedMagic, dl,
+ EVT::getIntegerVT(*DAG.getContext(), 64));
+ PreShift = DAG.getConstant(0, dl, ShSVT);
+ PostShift = DAG.getConstant(0, dl, ShSVT);
+ NPQFactor = DAG.getConstant(APInt::getZero(SVTBits), dl, SVT);
+ UseNPQ = false;
+ UsePreShift = false;
+ UsePostShift = false;
+ } else {
+ MagicFactor = DAG.getConstant(magics.Magic.zext(SVTBits), dl, SVT);
+
+ assert(magics.PreShift < Divisor.getBitWidth() &&
+ "We shouldn't generate an undefined shift!");
+ assert(magics.PostShift < Divisor.getBitWidth() &&
+ "We shouldn't generate an undefined shift!");
+ assert((!magics.IsAdd || magics.PreShift == 0) &&
+ "Unexpected pre-shift");
+ PreShift = DAG.getConstant(magics.PreShift, dl, ShSVT);
+ PostShift = DAG.getConstant(magics.PostShift, dl, ShSVT);
+ NPQFactor = DAG.getConstant(
+ magics.IsAdd ? APInt::getOneBitSet(SVTBits, EltBits - 1)
+ : APInt::getZero(SVTBits),
+ dl, SVT);
+ UseNPQ |= magics.IsAdd;
+ UsePreShift |= magics.PreShift != 0;
+ UsePostShift |= magics.PostShift != 0;
+ }
}
PreShifts.push_back(PreShift);
@@ -6864,6 +6894,46 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
PostShift = PostShifts[0];
}
+ // Our Approach: Use optimized 33-bit method for 32-bit division
+ if (Use33BitOptimization) {
+ // x is i32, MagicFactor is pre-shifted i64 constant
+ // Compute: (i64(x) * MagicFactor) >> 64
+ EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), 64);
+ SDValue X64 = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, N0);
+
+ // Get the pre-shifted constant (it's already in MagicFactor as i64)
+ SDValue MagicFactor64 = isa<ConstantSDNode>(MagicFactor)
+ ? MagicFactor
+ : MagicFactors[0];
+
+ SDValue Result;
+ // Perform 64x64 -> 128 multiplication and extract high 64 bits
+ if (isOperationLegalOrCustom(ISD::MULHU, WideVT, IsAfterLegalization)) {
+ SDValue High = DAG.getNode(ISD::MULHU, dl, WideVT, X64, MagicFactor64);
+ Created.push_back(High.getNode());
+ // Truncate back to i32
+ Result = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
+ } else if (isOperationLegalOrCustom(ISD::UMUL_LOHI, WideVT, IsAfterLegalization)) {
+ SDValue LoHi = DAG.getNode(ISD::UMUL_LOHI, dl,
+ DAG.getVTList(WideVT, WideVT),
+ X64, MagicFactor64);
+ SDValue High = SDValue(LoHi.getNode(), 1);
+ Created.push_back(LoHi.getNode());
+ Result = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
+ } else {
+ // Fallback to standard path if 64-bit MULHU is not available
+ Use33BitOptimization = false;
+ goto standard_path;
+ }
+
+ // Handle divisor == 1 case with SELECT
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue One = DAG.getConstant(1, dl, VT);
+ SDValue IsOne = DAG.getSetCC(dl, SetCCVT, N1, One, ISD::SETEQ);
+ return DAG.getSelect(dl, VT, IsOne, N0, Result);
+ }
+
+standard_path:
SDValue Q = N0;
if (UsePreShift) {
Q = DAG.getNode(ISD::SRL, dl, VT, Q, PreShift);
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index 927d46612f443..c1f745c2f42de 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -306,13 +306,11 @@ entry:
define i32 @ui32_7(i32 %a, i32 %b) {
; CHECK-SD-LABEL: ui32_7:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov w8, #18725 // =0x4925
-; CHECK-SD-NEXT: movk w8, #9362, lsl #16
-; CHECK-SD-NEXT: umull x8, w0, w8
-; CHECK-SD-NEXT: lsr x8, x8, #32
-; CHECK-SD-NEXT: sub w9, w0, w8
-; CHECK-SD-NEXT: add w8, w8, w9, lsr #1
-; CHECK-SD-NEXT: lsr w8, w8, #2
+; CHECK-SD-NEXT: mov x8, #2684354560 // =0xa0000000
+; CHECK-SD-NEXT: mov w9, w0
+; CHECK-SD-NEXT: movk x8, #18724, lsl #32
+; CHECK-SD-NEXT: movk x8, #9362, lsl #48
+; CHECK-SD-NEXT: umulh x8, x9, x8
; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3
; CHECK-SD-NEXT: add w0, w0, w8
; CHECK-SD-NEXT: ret
@@ -2558,20 +2556,19 @@ define <3 x i32> @uv3i32_7(<3 x i32> %d, <3 x i32> %e) {
; CHECK-SD-LABEL: uv3i32_7:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov w8, #18725 // =0x4925
-; CHECK-SD-NEXT: mov w9, v0.s[2]
+; CHECK-SD-NEXT: mov x9, #2684354560 // =0xa0000000
; CHECK-SD-NEXT: movk w8, #9362, lsl #16
+; CHECK-SD-NEXT: movk x9, #18724, lsl #32
; CHECK-SD-NEXT: dup v1.2s, w8
-; CHECK-SD-NEXT: umull x8, w9, w8
+; CHECK-SD-NEXT: mov w8, v0.s[2]
+; CHECK-SD-NEXT: movk x9, #9362, lsl #48
; CHECK-SD-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-SD-NEXT: lsr x8, x8, #32
-; CHECK-SD-NEXT: sub w10, w9, w8
+; CHECK-SD-NEXT: umulh x9, x8, x9
; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32
-; CHECK-SD-NEXT: add w8, w8, w10, lsr #1
-; CHECK-SD-NEXT: lsr w8, w8, #2
+; CHECK-SD-NEXT: sub w9, w9, w9, lsl #3
; CHECK-SD-NEXT: sub v2.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3
+; CHECK-SD: add w8, w8, w9
; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0
-; CHECK-SD-NEXT: add w8, w9, w8
; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #1
; CHECK-SD-NEXT: add v1.2s, v2.2s, v1.2s
; CHECK-SD-NEXT: movi v2.2s, #7
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index d22274e8312ca..21ff07b8bc008 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -1791,61 +1791,30 @@ define void @udiv_constantsplat_v8i32(ptr %a) {
; NONEON-NOSVE-LABEL: udiv_constantsplat_v8i32:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT: mov w8, #8969 // =0x2309
-; NONEON-NOSVE-NEXT: movk w8, #22765, lsl #16
+; NONEON-NOSVE-NEXT: mov x8, #301989888 // =0x12000000
+; NONEON-NOSVE-NEXT: movk x8, #55878, lsl #32
+; NONEON-NOSVE-NEXT: movk x8, #689, lsl #48
; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]!
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64
; NONEON-NOSVE-NEXT: ldr w9, [sp, #28]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w11, w9, #6
+; NONEON-NOSVE-NEXT: umulh x10, x9, x8
; NONEON-NOSVE-NEXT: ldr w9, [sp, #24]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w9, w9, #6
-; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #56]
+; NONEON-NOSVE-NEXT: umulh x9, x9, x8
+; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #56]
; NONEON-NOSVE-NEXT: ldr w9, [sp, #20]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w11, w9, #6
+; NONEON-NOSVE-NEXT: umulh x10, x9, x8
; NONEON-NOSVE-NEXT: ldr w9, [sp, #16]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w9, w9, #6
-; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #48]
+; NONEON-NOSVE-NEXT: umulh x9, x9, x8
+; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #48]
; NONEON-NOSVE-NEXT: ldr w9, [sp, #12]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w11, w9, #6
+; NONEON-NOSVE-NEXT: umulh x10, x9, x8
; NONEON-NOSVE-NEXT: ldr w9, [sp, #8]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w9, w9, #6
-; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #40]
+; NONEON-NOSVE-NEXT: umulh x9, x9, x8
+; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #40]
; NONEON-NOSVE-NEXT: ldr w9, [sp, #4]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w10, w9, #6
+; NONEON-NOSVE-NEXT: umulh x10, x9, x8
; NONEON-NOSVE-NEXT: ldr w9, [sp]
-; NONEON-NOSVE-NEXT: umull x8, w9, w8
-; NONEON-NOSVE-NEXT: lsr x8, x8, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w8
-; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w8, w8, #6
+; NONEON-NOSVE-NEXT: umulh x8, x9, x8
; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32]
; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32]
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
diff --git a/llvm/test/CodeGen/AArch64/udiv-const-optimization.ll b/llvm/test/CodeGen/AArch64/udiv-const-optimization.ll
new file mode 100644
index 0000000000000..d1282d376f0aa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/udiv-const-optimization.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s
+
+; Test optimization of 32-bit unsigned division by constants with 33-bit magic
+; constants (IsAdd=true) on AArch64. The optimization uses the umulh instruction.
+
+define i32 @udiv_by_7(i32 %x) nounwind {
+; CHECK-LABEL: udiv_by_7:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, #2684354560 // =0xa0000000
+; CHECK-NEXT: mov w9, w0
+; CHECK-NEXT: movk x8, #18724, lsl #32
+; CHECK-NEXT: movk x8, #9362, lsl #48
+; CHECK-NEXT: umulh x0, x9, x8
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %div = udiv i32 %x, 7
+ ret i32 %div
+}
+
+define i32 @udiv_by_19(i32 %x) nounwind {
+; CHECK-LABEL: udiv_by_19:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, #1476395008 // =0x58000000
+; CHECK-NEXT: mov w9, w0
+; CHECK-NEXT: movk x8, #17246, lsl #32
+; CHECK-NEXT: movk x8, #3449, lsl #48
+; CHECK-NEXT: umulh x0, x9, x8
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %div = udiv i32 %x, 19
+ ret i32 %div
+}
+
+define i32 @udiv_by_21(i32 %x) nounwind {
+; CHECK-LABEL: udiv_by_21:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, #939524096 // =0x38000000
+; CHECK-NEXT: mov w9, w0
+; CHECK-NEXT: movk x8, #49932, lsl #32
+; CHECK-NEXT: movk x8, #3120, lsl #48
+; CHECK-NEXT: umulh x0, x9, x8
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %div = udiv i32 %x, 21
+ ret i32 %div
+}
+
+; Test non-optimized case
+define i32 @udiv_by_3(i32 %x) nounwind {
+; CHECK-LABEL: udiv_by_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #43691 // =0xaaab
+; CHECK-NEXT: movk w8, #43690, lsl #16
+; CHECK-NEXT: umull x8, w0, w8
+; CHECK-NEXT: lsr x0, x8, #33
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %div = udiv i32 %x, 3
+ ret i32 %div
+}
diff --git a/llvm/test/CodeGen/AArch64/urem-lkk.ll b/llvm/test/CodeGen/AArch64/urem-lkk.ll
index 40016c7e4ce0f..c09924b1b430f 100644
--- a/llvm/test/CodeGen/AArch64/urem-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/urem-lkk.ll
@@ -3,18 +3,29 @@
; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define i32 @fold_urem_positive_odd(i32 %x) {
-; CHECK-LABEL: fold_urem_positive_odd:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8969 // =0x2309
-; CHECK-NEXT: movk w8, #22765, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #32
-; CHECK-NEXT: sub w9, w0, w8
-; CHECK-NEXT: add w8, w8, w9, lsr #1
-; CHECK-NEXT: mov w9, #95 // =0x5f
-; CHECK-NEXT: lsr w8, w8, #6
-; CHECK-NEXT: msub w0, w8, w9, w0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fold_urem_positive_odd:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mov x8, #301989888 // =0x12000000
+; CHECK-SD-NEXT: mov w9, w0
+; CHECK-SD-NEXT: movk x8, #55878, lsl #32
+; CHECK-SD-NEXT: movk x8, #689, lsl #48
+; CHECK-SD-NEXT: umulh x8, x9, x8
+; CHECK-SD-NEXT: mov w9, #95 // =0x5f
+; CHECK-SD-NEXT: msub w0, w8, w9, w0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fold_urem_positive_odd:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, #8969 // =0x2309
+; CHECK-GI-NEXT: movk w8, #22765, lsl #16
+; CHECK-GI-NEXT: umull x8, w0, w8
+; CHECK-GI-NEXT: lsr x8, x8, #32
+; CHECK-GI-NEXT: sub w9, w0, w8
+; CHECK-GI-NEXT: add w8, w8, w9, lsr #1
+; CHECK-GI-NEXT: mov w9, #95 // =0x5f
+; CHECK-GI-NEXT: lsr w8, w8, #6
+; CHECK-GI-NEXT: msub w0, w8, w9, w0
+; CHECK-GI-NEXT: ret
%1 = urem i32 %x, 95
ret i32 %1
}
@@ -37,14 +48,12 @@ define i32 @fold_urem_positive_even(i32 %x) {
define i32 @combine_urem_udiv(i32 %x) {
; CHECK-SD-LABEL: combine_urem_udiv:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov w8, #8969 // =0x2309
-; CHECK-SD-NEXT: movk w8, #22765, lsl #16
-; CHECK-SD-NEXT: umull x8, w0, w8
-; CHECK-SD-NEXT: lsr x8, x8, #32
-; CHECK-SD-NEXT: sub w9, w0, w8
-; CHECK-SD-NEXT: add w8, w8, w9, lsr #1
+; CHECK-SD-NEXT: mov x8, #301989888 // =0x12000000
+; CHECK-SD-NEXT: mov w9, w0
+; CHECK-SD-NEXT: movk x8, #55878, lsl #32
+; CHECK-SD-NEXT: movk x8, #689, lsl #48
+; CHECK-SD-NEXT: umulh x8, x9, x8
; CHECK-SD-NEXT: mov w9, #95 // =0x5f
-; CHECK-SD-NEXT: lsr w8, w8, #6
; CHECK-SD-NEXT: msub w9, w8, w9, w0
; CHECK-SD-NEXT: add w0, w9, w8
; CHECK-SD-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index 53c3f5841ba0f..24c882daa113d 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -48,29 +48,25 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
;
; RV64IM-LABEL: udiv_constant_add:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: slli a1, a0, 32
-; RV64IM-NEXT: lui a2, 149797
-; RV64IM-NEXT: addi a2, a2, -1755
-; RV64IM-NEXT: slli a2, a2, 32
-; RV64IM-NEXT: mulhu a1, a1, a2
-; RV64IM-NEXT: srli a1, a1, 32
-; RV64IM-NEXT: sub a0, a0, a1
-; RV64IM-NEXT: srliw a0, a0, 1
-; RV64IM-NEXT: add a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 2
+; RV64IM-NEXT: slli a0, a0, 32
+; RV64IM-NEXT: lui a1, 293
+; RV64IM-NEXT: srli a0, a0, 32
+; RV64IM-NEXT: addi a1, a1, -1755
+; RV64IM-NEXT: slli a1, a1, 12
+; RV64IM-NEXT: addi a1, a1, -1755
+; RV64IM-NEXT: slli a1, a1, 29
+; RV64IM-NEXT: mulhu a0, a0, a1
; RV64IM-NEXT: ret
;
; RV64IMZB-LABEL: udiv_constant_add:
; RV64IMZB: # %bb.0:
-; RV64IMZB-NEXT: zext.w a1, a0
-; RV64IMZB-NEXT: lui a2, 149797
-; RV64IMZB-NEXT: addi a2, a2, -1755
-; RV64IMZB-NEXT: mul a1, a1, a2
-; RV64IMZB-NEXT: srli a1, a1, 32
-; RV64IMZB-NEXT: sub a0, a0, a1
-; RV64IMZB-NEXT: srliw a0, a0, 1
-; RV64IMZB-NEXT: add a0, a0, a1
-; RV64IMZB-NEXT: srli a0, a0, 2
+; RV64IMZB-NEXT: zext.w a0, a0
+; RV64IMZB-NEXT: lui a1, 293
+; RV64IMZB-NEXT: addi a1, a1, -1755
+; RV64IMZB-NEXT: slli a1, a1, 12
+; RV64IMZB-NEXT: addi a1, a1, -1755
+; RV64IMZB-NEXT: slli a1, a1, 29
+; RV64IMZB-NEXT: mulhu a0, a0, a1
; RV64IMZB-NEXT: ret
%1 = udiv i32 %a, 7
ret i32 %1
diff --git a/llvm/test/CodeGen/RISCV/udiv-const-optimization.ll b/llvm/test/CodeGen/RISCV/udiv-const-optimization.ll
new file mode 100644
index 0000000000000..5485a5b230a27
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/udiv-const-optimization.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv64-unknown-linux-gnu -mattr=+m | FileCheck %s
+
+; Test optimization of 32-bit unsigned division by constants with 33-bit magic
+; constants (IsAdd=true) on RISC-V64. The optimization uses the mulhu instruction.
+
+define i32 @udiv_by_7(i32 %x) nounwind {
+; CHECK-LABEL: udiv_by_7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 32
+; CHECK-NEXT: lui a1, 293
+; CHECK-NEXT: srli a0, a0, 32
+; CHECK-NEXT: addi a1, a1, -1755
+; CHECK-NEXT: slli a1, a1, 12
+; CHECK-NEXT: addi a1, a1, -1755
+; CHECK-NEXT: slli a1, a1, 29
+; CHECK-NEXT: mulhu a0, a0, a1
+; CHECK-NEXT: ret
+ %div = udiv i32 %x, 7
+ ret i32 %div
+}
+
+define i32 @udiv_by_19(i32 %x) nounwind {
+; CHECK-LABEL: udiv_by_19:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 32
+; CHECK-NEXT: lui a1, 717447
+; CHECK-NEXT: srli a0, a0, 32
+; CHECK-NEXT: addi a1, a1, -1077
+; CHECK-NEXT: slli a1, a1, 31
+; CHECK-NEXT: srli a1, a1, 4
+; CHECK-NEXT: mulhu a0, a0, a1
+; CHECK-NEXT: ret
+ %div = udiv i32 %x, 19
+ ret i32 %div
+}
+
+define i32 @udiv_by_21(i32 %x) nounwind {
+; CHECK-LABEL: udiv_by_21:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 32
+; CHECK-NEXT: lui a1, 549254
+; CHECK-NEXT: srli a0, a0, 32
+; CHECK-NEXT: addi a1, a1, 391
+; CHECK-NEXT: slli a1, a1, 31
+; CHECK-NEXT: srli a1, a1, 4
+; CHECK-NEXT: mulhu a0, a0, a1
+; CHECK-NEXT: ret
+ %div = udiv i32 %x, 21
+ ret i32 %div
+}
+
+; Test non-optimized case
+define i32 @udiv_by_3(i32 %x) nounwind {
+; CHECK-LABEL: udiv_by_3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 32
+; CHECK-NEXT: lui a1, 699051
+; CHECK-NEXT: addi a1, a1, -1365
+; CHECK-NEXT: slli a1, a1, 32
+; CHECK-NEXT: mulhu a0, a0, a1
+; CHECK-NEXT: srli a0, a0, 33
+; CHECK-NEXT: ret
+ %div = udiv i32 %x, 3
+ ret i32 %div
+}
diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll
index ee496123ba7b4..449e56c82e74c 100644
--- a/llvm/test/CodeGen/RISCV/urem-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll
@@ -43,15 +43,13 @@ define i32 @fold_urem_positive_odd(i32 %x) nounwind {
; RV64IM-LABEL: fold_urem_positive_odd:
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a1, a0, 32
-; RV64IM-NEXT: lui a2, 364242
+; RV64IM-NEXT: lui a2, 172
+; RV64IM-NEXT: srli a1, a1, 32
+; RV64IM-NEXT: addi a2, a2, 1897
+; RV64IM-NEXT: slli a2, a2, 13
; RV64IM-NEXT: addi a2, a2, 777
-; RV64IM-NEXT: slli a2, a2, 32
+; RV64IM-NEXT: slli a2, a2, 25
; RV64IM-NEXT: mulhu a1, a1, a2
-; RV64IM-NEXT: srli a1, a1, 32
-; RV64IM-NEXT: sub a2, a0, a1
-; RV64IM-NEXT: srliw a2, a2, 1
-; RV64IM-NEXT: add a1, a2, a1
-; RV64IM-NEXT: srli a1, a1, 6
; RV64IM-NEXT: li a2, 95
; RV64IM-NEXT: mul a1, a1, a2
; RV64IM-NEXT: subw a0, a0, a1
@@ -169,16 +167,14 @@ define i32 @combine_urem_udiv(i32 %x) nounwind {
; RV64IM-LABEL: combine_urem_udiv:
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a1, a0, 32
-; RV64IM-NEXT: lui a2, 364242
+; RV64IM-NEXT: lui a2, 172
+; RV64IM-NEXT: srli a1, a1, 32
+; RV64IM-NEXT: addi a2, a2, 1897
+; RV64IM-NEXT: slli a2, a2, 13
; RV64IM-NEXT: addi a2, a2, 777
-; RV64IM-NEXT: slli a2, a2, 32
+; RV64IM-NEXT: slli a2, a2, 25
; RV64IM-NEXT: mulhu a1, a1, a2
-; RV64IM-NEXT: srli a1, a1, 32
-; RV64IM-NEXT: sub a2, a0, a1
-; RV64IM-NEXT: srliw a2, a2, 1
-; RV64IM-NEXT: add a1, a2, a1
; RV64IM-NEXT: li a2, 95
-; RV64IM-NEXT: srli a1, a1, 6
; RV64IM-NEXT: mul a2, a1, a2
; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: subw a0, a0, a2
diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
index c9c88f7258435..cb1c078ee5129 100644
--- a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
+++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
@@ -798,20 +798,14 @@ define void @simple_urem_skip_const_rem_amt(i32 %N) nounwind {
; CHECK-NEXT: movl %edi, %ebx
; CHECK-NEXT: addl $-4, %ebx
; CHECK-NEXT: movl $4, %ebp
-; CHECK-NEXT: movl $2938661835, %r14d # imm = 0xAF286BCB
+; CHECK-NEXT: movabsq $970881267157434368, %r14 # imm = 0xD79435E58000000
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB13_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movl %ebp, %eax
-; CHECK-NEXT: imulq %r14, %rax
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movl %ebp, %ecx
-; CHECK-NEXT: subl %eax, %ecx
-; CHECK-NEXT: shrl %ecx
-; CHECK-NEXT: addl %eax, %ecx
-; CHECK-NEXT: shrl $4, %ecx
-; CHECK-NEXT: leal (%rcx,%rcx,8), %eax
-; CHECK-NEXT: leal (%rcx,%rax,2), %eax
+; CHECK-NEXT: mulq %r14
+; CHECK-NEXT: leal (%rdx,%rdx,8), %eax
+; CHECK-NEXT: leal (%rdx,%rax,2), %eax
; CHECK-NEXT: movl %ebp, %edi
; CHECK-NEXT: subl %eax, %edi
; CHECK-NEXT: callq use.i32 at PLT
diff --git a/llvm/test/CodeGen/X86/udiv-const-optimization.ll b/llvm/test/CodeGen/X86/udiv-const-optimization.ll
new file mode 100644
index 0000000000000..a4fa413bab038
--- /dev/null
+++ b/llvm/test/CodeGen/X86/udiv-const-optimization.ll
@@ -0,0 +1,141 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi2 | FileCheck %s --check-prefix=X64-BMI2
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s --check-prefix=X86
+
+; Test optimization of 32-bit unsigned division by constants with 33-bit magic
+; constants (IsAdd=true) on 64-bit targets. The optimization uses pre-shifted
+; constants and 64x64->128 bit multiplication to reduce instruction count.
+
+define i32 @udiv_by_7(i32 %x) nounwind {
+; X64-LABEL: udiv_by_7:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movabsq $2635249153617166336, %rcx # imm = 0x24924924A0000000
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: retq
+;
+; X64-BMI2-LABEL: udiv_by_7:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movl %edi, %edx
+; X64-BMI2-NEXT: movabsq $2635249153617166336, %rax # imm = 0x24924924A0000000
+; X64-BMI2-NEXT: mulxq %rax, %rax, %rax
+; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-BMI2-NEXT: retq
+;
+; X86-LABEL: udiv_by_7:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $613566757, %edx # imm = 0x24924925
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: shrl %ecx
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: shrl $2, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: retl
+ %div = udiv i32 %x, 7
+ ret i32 %div
+}
+
+define i32 @udiv_by_19(i32 %x) nounwind {
+; X64-LABEL: udiv_by_19:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movabsq $970881267157434368, %rcx # imm = 0xD79435E58000000
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: retq
+;
+; X64-BMI2-LABEL: udiv_by_19:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movl %edi, %edx
+; X64-BMI2-NEXT: movabsq $970881267157434368, %rax # imm = 0xD79435E58000000
+; X64-BMI2-NEXT: mulxq %rax, %rax, %rax
+; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-BMI2-NEXT: retq
+;
+; X86-LABEL: udiv_by_19:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl ${{-?[0-9]+}}, %edx # imm = 0xAF286BCB
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: shrl %ecx
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: shrl $4, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: retl
+ %div = udiv i32 %x, 19
+ ret i32 %div
+}
+
+define i32 @udiv_by_21(i32 %x) nounwind {
+; X64-LABEL: udiv_by_21:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movabsq $878416384583794688, %rcx # imm = 0xC30C30C38000000
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: retq
+;
+; X64-BMI2-LABEL: udiv_by_21:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movl %edi, %edx
+; X64-BMI2-NEXT: movabsq $878416384583794688, %rax # imm = 0xC30C30C38000000
+; X64-BMI2-NEXT: mulxq %rax, %rax, %rax
+; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-BMI2-NEXT: retq
+;
+; X86-LABEL: udiv_by_21:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl ${{-?[0-9]+}}, %edx # imm = 0x86186187
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: shrl %ecx
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: shrl $4, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: retl
+ %div = udiv i32 %x, 21
+ ret i32 %div
+}
+
+; Test non-optimized case: divisor that doesn't use IsAdd
+define i32 @udiv_by_3(i32 %x) nounwind {
+; X64-LABEL: udiv_by_3:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: movl $2863311531, %eax # imm = 0xAAAAAAAB
+; X64-NEXT: imulq %rcx, %rax
+; X64-NEXT: shrq $33, %rax
+; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: retq
+;
+; X64-BMI2-LABEL: udiv_by_3:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movl %edi, %ecx
+; X64-BMI2-NEXT: movl $2863311531, %eax # imm = 0xAAAAAAAB
+; X64-BMI2-NEXT: imulq %rcx, %rax
+; X64-BMI2-NEXT: shrq $33, %rax
+; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-BMI2-NEXT: retq
+;
+; X86-LABEL: udiv_by_3:
+; X86: # %bb.0:
+; X86-NEXT: movl ${{-?[0-9]+}}, %eax # imm = 0xAAAAAAAB
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shrl %eax
+; X86-NEXT: retl
+ %div = udiv i32 %x, 3
+ ret i32 %div
+}
diff --git a/llvm/test/CodeGen/X86/urem-lkk.ll b/llvm/test/CodeGen/X86/urem-lkk.ll
index 573f875544cd4..4ac12eae0e5b9 100644
--- a/llvm/test/CodeGen/X86/urem-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-lkk.ll
@@ -5,16 +5,11 @@ define i32 @fold_urem_positive_odd(i32 %x) {
; CHECK-LABEL: fold_urem_positive_odd:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: imulq $1491936009, %rcx, %rcx # imm = 0x58ED2309
-; CHECK-NEXT: shrq $32, %rcx
-; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: subl %ecx, %edx
-; CHECK-NEXT: shrl %edx
-; CHECK-NEXT: addl %ecx, %edx
-; CHECK-NEXT: shrl $6, %edx
-; CHECK-NEXT: imull $95, %edx, %ecx
-; CHECK-NEXT: subl %ecx, %eax
+; CHECK-NEXT: movabsq $194176253438197760, %rcx # imm = 0x2B1DA4612000000
+; CHECK-NEXT: mulq %rcx
+; CHECK-NEXT: imull $95, %edx, %eax
+; CHECK-NEXT: subl %eax, %edi
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%1 = urem i32 %x, 95
ret i32 %1
@@ -41,17 +36,13 @@ define i32 @fold_urem_positive_even(i32 %x) {
define i32 @combine_urem_udiv(i32 %x) {
; CHECK-LABEL: combine_urem_udiv:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: imulq $1491936009, %rax, %rcx # imm = 0x58ED2309
-; CHECK-NEXT: shrq $32, %rcx
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: subl %ecx, %eax
-; CHECK-NEXT: shrl %eax
-; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: shrl $6, %eax
-; CHECK-NEXT: imull $95, %eax, %ecx
-; CHECK-NEXT: subl %ecx, %edi
-; CHECK-NEXT: addl %edi, %eax
+; CHECK-NEXT: movabsq $194176253438197760, %rcx # imm = 0x2B1DA4612000000
+; CHECK-NEXT: mulq %rcx
+; CHECK-NEXT: imull $95, %edx, %eax
+; CHECK-NEXT: subl %eax, %edi
+; CHECK-NEXT: leal (%rdi,%rdx), %eax
; CHECK-NEXT: retq
%1 = urem i32 %x, 95
%2 = udiv i32 %x, 95
>From 6ecca57678053b1c51a715ee93603142a524d26d Mon Sep 17 00:00:00 2001
From: MITSUNARI Shigeo <herumi at nifty.com>
Date: Fri, 13 Feb 2026 12:51:09 +0900
Subject: [PATCH 2/2] Apply review suggestions from llvm/llvm-project#181288
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 34 +++++++------------
1 file changed, 12 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index c3829002b5b36..2d149a10c3036 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -12,7 +12,6 @@
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
@@ -6792,6 +6791,9 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
const unsigned SVTBits = SVT.getSizeInBits();
bool UseNPQ = false, UsePreShift = false, UsePostShift = false;
+ EVT WideVT64 = EVT::getIntegerVT(*DAG.getContext(), 64);
+ bool HasWideVT64MULHU = isOperationLegalOrCustom(ISD::MULHU, WideVT64, IsAfterLegalization);
+ bool HasWideVT64UMUL_LOHI = isOperationLegalOrCustom(ISD::UMUL_LOHI, WideVT64, IsAfterLegalization);
bool Use33BitOptimization = false;
SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
@@ -6814,18 +6816,13 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
UnsignedDivisionByConstantInfo::get(
Divisor, std::min(KnownLeadingZeros, Divisor.countl_zero()));
- // Our Approach: For 32-bit division with IsAdd (33-bit
- // magic case), use optimized method: preshift c by (64-a) bits to
- // eliminate runtime shift. This requires 64x64->128 bit multiplication.
+ // For 32-bit division with IsAdd (33-bit magic case), use optimized method:
+ // preshift c by (64-a) bits to eliminate runtime shift.
+ // This requires 64x64->128 bit multiplication.
// Only apply to scalar types since SIMD lacks 64x64->128 high multiply.
// Note: IsAdd=true implies PreShift=0 by algorithm design.
// Check if 64-bit MULHU is available before applying this optimization.
- EVT WideVT64 = EVT::getIntegerVT(*DAG.getContext(), 64);
- bool Has64BitMULHU =
- isOperationLegalOrCustom(ISD::MULHU, WideVT64, IsAfterLegalization) ||
- isOperationLegalOrCustom(ISD::UMUL_LOHI, WideVT64,
- IsAfterLegalization);
- if (EltBits == 32 && !VT.isVector() && Has64BitMULHU && magics.IsAdd) {
+ if (EltBits == 32 && !VT.isVector() && (HasWideVT64MULHU || HasWideVT64UMUL_LOHI) && magics.IsAdd) {
// For IsAdd case, actual magic constant is 2^32 + Magic (33-bit)
unsigned OriginalShift = magics.PostShift + 33;
APInt RealMagic = APInt(65, 1).shl(32) + magics.Magic.zext(65); // 2^32 + Magic
@@ -6894,12 +6891,10 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
PostShift = PostShifts[0];
}
- // Our Approach: Use optimized 33-bit method for 32-bit division
if (Use33BitOptimization) {
// x is i32, MagicFactor is pre-shifted i64 constant
// Compute: (i64(x) * MagicFactor) >> 64
- EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), 64);
- SDValue X64 = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, N0);
+ SDValue X64 = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT64, N0);
// Get the pre-shifted constant (it's already in MagicFactor as i64)
SDValue MagicFactor64 = isa<ConstantSDNode>(MagicFactor)
@@ -6908,22 +6903,18 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
SDValue Result;
// Perform 64x64 -> 128 multiplication and extract high 64 bits
- if (isOperationLegalOrCustom(ISD::MULHU, WideVT, IsAfterLegalization)) {
- SDValue High = DAG.getNode(ISD::MULHU, dl, WideVT, X64, MagicFactor64);
+ if (HasWideVT64MULHU) {
+ SDValue High = DAG.getNode(ISD::MULHU, dl, WideVT64, X64, MagicFactor64);
Created.push_back(High.getNode());
// Truncate back to i32
Result = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
- } else if (isOperationLegalOrCustom(ISD::UMUL_LOHI, WideVT, IsAfterLegalization)) {
+ } else if (HasWideVT64UMUL_LOHI) {
SDValue LoHi = DAG.getNode(ISD::UMUL_LOHI, dl,
- DAG.getVTList(WideVT, WideVT),
+ DAG.getVTList(WideVT64, WideVT64),
X64, MagicFactor64);
SDValue High = SDValue(LoHi.getNode(), 1);
Created.push_back(LoHi.getNode());
Result = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
- } else {
- // Fallback to standard path if 64-bit MULHU is not available
- Use33BitOptimization = false;
- goto standard_path;
}
// Handle divisor == 1 case with SELECT
@@ -6933,7 +6924,6 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
return DAG.getSelect(dl, VT, IsOne, N0, Result);
}
-standard_path:
SDValue Q = N0;
if (UsePreShift) {
Q = DAG.getNode(ISD::SRL, dl, VT, Q, PreShift);