[llvm] [SelectionDAG] constant division fallback for existing Constant Division optimization (PR #188402)
Takashi Idobe via llvm-commits
llvm-commits at lists.llvm.org
Sat Apr 4 19:19:21 PDT 2026
https://github.com/Takashiidobe updated https://github.com/llvm/llvm-project/pull/188402
>From b4a6a891a45f341fe0053456b93829d12c4abd37 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Mon, 23 Mar 2026 20:07:36 -0400
Subject: [PATCH 1/6] add pre-commit tests for narrow udiv magic multiply
Add tests showing the current codegen for i8 and i16 udiv-by-constant.
The Hacker's Delight algorithm often needs an expensive add-and-shift
fixup for these narrow types. A subsequent patch will improve BuildUDIV
to use a simple magic multiply at a wider legal type instead.
---
llvm/test/CodeGen/X86/udiv-narrow-magic.ll | 170 +++++++++++++++++++++
1 file changed, 170 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/udiv-narrow-magic.ll
diff --git a/llvm/test/CodeGen/X86/udiv-narrow-magic.ll b/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
new file mode 100644
index 0000000000000..5aebf694da0ff
--- /dev/null
+++ b/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=x86_64 < %s | FileCheck %s
+
+; When a narrow udiv-by-constant (i8, i16) is lowered via BuildUDIV, the
+; Hacker's Delight algorithm may need an expensive add-and-shift fixup.
+; A wider legal type (e.g. i32 for i8) has enough headroom for a simple
+; Magic = ceil(2^Shift / C) that needs no fixup at all.
+
+; --- i8 cases ---
+
+define i8 @udiv_i8_by7(i8 %x) nounwind {
+; CHECK-LABEL: udiv_i8_by7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: leal (%rax,%rax,8), %ecx
+; CHECK-NEXT: leal (%rax,%rcx,4), %ecx
+; CHECK-NEXT: shrl $8, %ecx
+; CHECK-NEXT: subb %cl, %al
+; CHECK-NEXT: shrb %al
+; CHECK-NEXT: addb %cl, %al
+; CHECK-NEXT: shrb $2, %al
+; CHECK-NEXT: # kill: def $al killed $al killed $rax
+; CHECK-NEXT: retq
+ %d = udiv i8 %x, 7
+ ret i8 %d
+}
+
+define i8 @udiv_i8_by5(i8 %x) nounwind {
+; CHECK-LABEL: udiv_i8_by5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: imull $205, %eax, %eax
+; CHECK-NEXT: shrl $10, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+ %d = udiv i8 %x, 5
+ ret i8 %d
+}
+
+define i8 @udiv_i8_by3(i8 %x) nounwind {
+; CHECK-LABEL: udiv_i8_by3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: imull $171, %eax, %eax
+; CHECK-NEXT: shrl $9, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+ %d = udiv i8 %x, 3
+ ret i8 %d
+}
+
+; Even divisor - stock uses pre-shift + magic; simple magic avoids pre-shift.
+define i8 @udiv_i8_by78(i8 %x) nounwind {
+; CHECK-LABEL: udiv_i8_by78:
+; CHECK: # %bb.0:
+; CHECK-NEXT: shrb %dil
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: imull $211, %eax, %eax
+; CHECK-NEXT: shrl $13, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+ %d = udiv i8 %x, 78
+ ret i8 %d
+}
+
+define i8 @udiv_i8_by116(i8 %x) nounwind {
+; CHECK-LABEL: udiv_i8_by116:
+; CHECK: # %bb.0:
+; CHECK-NEXT: shrb $2, %dil
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: imull $71, %eax, %eax
+; CHECK-NEXT: shrl $11, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+ %d = udiv i8 %x, 116
+ ret i8 %d
+}
+
+; Power of 2 - should NOT fire (already lowered to shift).
+define i8 @udiv_i8_by4(i8 %x) nounwind {
+; CHECK-LABEL: udiv_i8_by4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: shrb $2, %al
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+ %d = udiv i8 %x, 4
+ ret i8 %d
+}
+
+; Division by 1 - should NOT fire.
+define i8 @udiv_i8_by1(i8 %x) nounwind {
+; CHECK-LABEL: udiv_i8_by1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
+ %d = udiv i8 %x, 1
+ ret i8 %d
+}
+
+; Bare i8 udiv feeding another i8 op (no zext).
+define i8 @udiv_i8_then_add(i8 %x, i8 %y) nounwind {
+; CHECK-LABEL: udiv_i8_then_add:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: leal (%rax,%rax,8), %ecx
+; CHECK-NEXT: leal (%rax,%rcx,4), %ecx
+; CHECK-NEXT: shrl $8, %ecx
+; CHECK-NEXT: subb %cl, %al
+; CHECK-NEXT: shrb %al
+; CHECK-NEXT: addb %cl, %al
+; CHECK-NEXT: shrb $2, %al
+; CHECK-NEXT: addb %sil, %al
+; CHECK-NEXT: # kill: def $al killed $al killed $rax
+; CHECK-NEXT: retq
+ %d = udiv i8 %x, 7
+ %r = add i8 %d, %y
+ ret i8 %r
+}
+
+; --- i16 cases ---
+
+define i16 @udiv_i16_by7(i16 %x) nounwind {
+; CHECK-LABEL: udiv_i16_by7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: imull $9363, %eax, %ecx # imm = 0x2493
+; CHECK-NEXT: shrl $16, %ecx
+; CHECK-NEXT: subl %ecx, %edi
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: shrl %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: shrl $2, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+ %d = udiv i16 %x, 7
+ ret i16 %d
+}
+
+define i16 @udiv_i16_by100(i16 %x) nounwind {
+; CHECK-LABEL: udiv_i16_by100:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: shrl $2, %eax
+; CHECK-NEXT: imull $5243, %eax, %eax # imm = 0x147B
+; CHECK-NEXT: shrl $17, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+ %d = udiv i16 %x, 100
+ ret i16 %d
+}
+
+; zext(udiv i16) - should also improve.
+define i32 @zext_udiv_i16_by7(i16 %x) nounwind {
+; CHECK-LABEL: zext_udiv_i16_by7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: imull $9363, %eax, %ecx # imm = 0x2493
+; CHECK-NEXT: shrl $16, %ecx
+; CHECK-NEXT: subl %ecx, %edi
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: shrl %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: shrl $2, %eax
+; CHECK-NEXT: retq
+ %d = udiv i16 %x, 7
+ %z = zext i16 %d to i32
+ ret i32 %z
+}
>From 5b9bb3086de6024f404eaec034f230db5fb9c280 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Tue, 24 Mar 2026 17:36:01 -0400
Subject: [PATCH 2/6] Use fixup-free 64-bit magic multiply for narrow udiv with
IsAdd
For i8/i16 udiv by constant where Hacker's Delight requires an
add-and-shift fixup (IsAdd), try a fixup-free alternative:
trunc(srl(mul(zext(x, 64), ceil(2^Shift / C)), Shift))
This is strictly cheaper than the HD NPQ path (zext + MUL + SRL vs.
MULHU + sub + srl + add + srl). The optimization is gated on i64 MUL
being natively legal, since some divisors near powers of two require
widening to 64 bits and a synthesized 64-bit multiply would be worse
than the fixup. When no fixup-free solution exists, the HD path is used.
Implementation notes:
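- findSimpleWideMagic searches 64-bit space only: when HD requires IsAdd
for i8/i16, no 32-bit fixup-free solution exists (brute-force verified).
NOTE(review): this appears not to hold for i8 -- the updated udiv_i8_by7
test lowers to imull $293 + shrl $11, and 255 * 293 fits in 32 bits;
confirm whether the claim should be restricted to i16.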
- Single i64 legality check at call site gates the entire optimization
- Magic*C overflow check removed: Magic*C <= 2^63 + 65535 < 2^64 for
i8/i16, so it never overflows
- Break on Check (1) overflow is correct: Magic = ceil(2^Shift / C)
grows monotonically with Shift, so overflow can only worsen
- Return type uses a named struct + bool following LLVM convention
- UseSimpleWideMul flag replaced by SimpleWideMulMagic.getNode() sentinel
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 101 +++++-
llvm/test/CodeGen/X86/udiv-narrow-magic.ll | 43 +--
llvm/test/CodeGen/X86/urem-vector-lkk.ll | 298 +++++++++++++++++-
3 files changed, 408 insertions(+), 34 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 092bc283c84dc..28f962c93b981 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6781,6 +6781,56 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::ADD, dl, VT, Q, T);
}
+/// For narrow scalar types (i8/i16) where Hacker's Delight requires an
+/// add-and-shift fixup (IsAdd), check whether a fixup-free 64-bit magic
+/// multiply exists:
+/// trunc(srl(mul(zext(x, 64), Magic), Shift))
+/// where Magic = ceil(2^Shift / C).
+///
+/// No fixup is needed when two conditions hold:
+/// (1) MaxX * Magic < 2^64 (multiply doesn't overflow 64 bits)
+/// (2) MaxX * (Magic*C - 2^Shift) < 2^Shift (approximation error is exact)
+///
+/// When IsAdd is required by HD, no 32-bit fixup-free solution exists, so we
+/// search only in 64-bit space. Populates Info and returns true on success.
+struct SimpleWideMagicInfo {
+ APInt Magic;
+ unsigned Shift;
+};
+
+static bool findSimpleWideMagic(const APInt &Divisor, const APInt &MaxX,
+ SimpleWideMagicInfo &Info) {
+ APInt DivWide = Divisor.zext(64);
+ APInt MaxWide = MaxX.zext(64);
+ unsigned MinShift = Divisor.ceilLogBase2();
+
+ for (unsigned Shift = MinShift; Shift < 64; ++Shift) {
+ APInt TwoToS = APInt(64, 1).shl(Shift);
+ APInt Magic = APIntOps::RoundingUDiv(TwoToS, DivWide, APInt::Rounding::UP);
+
+ // Check (1): MaxX * Magic must fit in 64 bits. Magic = ceil(2^Shift / C)
+ // grows monotonically with Shift, so once this overflows no larger Shift
+ // can succeed either.
+ bool Overflow = false;
+ (void)MaxWide.umul_ov(Magic, Overflow);
+ if (Overflow)
+ break;
+
+ // Check (2): MaxX * (Magic*C - 2^Shift) < 2^Shift.
+ // Magic*C never overflows 64 bits for i8/i16: Magic*C <= 2^Shift + C
+ // <= 2^63 + 65535 < 2^64.
+ APInt Error = Magic * DivWide - TwoToS;
+ APInt MaxError = MaxWide.umul_ov(Error, Overflow);
+ if (Overflow || MaxError.uge(TwoToS))
+ continue;
+
+ Info = {Magic, Shift};
+ return true;
+ }
+
+ return false;
+}
+
/// Given an ISD::UDIV node expressing a divide by constant,
/// return a DAG expression to select that will generate the same value by
/// multiplying by a magic number.
@@ -6852,9 +6902,13 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ // Compute known bits once; used both to reduce the Hacker's Delight magic
+ // and to check simple-wide-magic conditions below.
+ KnownBits Known0 = DAG.computeKnownBits(N0);
+
// Try to use leading zeros of the dividend to reduce the multiplier and
// avoid expensive fixups.
- unsigned KnownLeadingZeros = DAG.computeKnownBits(N0).countMinLeadingZeros();
+ unsigned KnownLeadingZeros = Known0.countMinLeadingZeros();
// If we're after type legalization and SVT is not legal, use the
// promoted type for creating constants to avoid creating nodes with
@@ -6882,6 +6936,13 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
bool UseNPQ = false, UsePreShift = false, UsePostShift = false;
bool UseWiden = false;
+ // For narrow scalar types (i8, i16) a simple fixup-free wide magic may exist:
+ // trunc(srl(mul(zext(x, W), ceil(2^Shift / C)), Shift))
+ // This is preferred over the NPQ add-and-shift fixup when it applies.
+ // SimpleWideMulMagic being non-null indicates this path was taken.
+ EVT SimpleWideMulVT;
+ SDValue SimpleWideMulMagic;
+ SDValue SimpleWideMulShift;
SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
auto BuildUDIVPattern = [&](ConstantSDNode *C) {
@@ -6927,6 +6988,29 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
UseNPQ |= magics.IsAdd;
UsePreShift |= magics.PreShift != 0;
UsePostShift |= magics.PostShift != 0;
+
+ // For narrow scalar types (i8, i16), when the Hacker's Delight magic
+ // requires the expensive NPQ add-and-shift fixup (IsAdd), check whether
+ // a simple fixup-free 64-bit magic exists:
+ // trunc(srl(mul(zext(x, 64), ceil(2^Shift / C)), Shift))
+ // When IsAdd is required by HD, no 32-bit fixup-free solution exists, so
+ // we go directly to 64-bit. Only attempt when i64 MUL is natively legal.
+ EVT I64VT = EVT::getIntegerVT(*DAG.getContext(), 64);
+ bool IsScalar = !VT.isVector();
+ bool IsNarrow = EltBits <= 16;
+ bool NeedsAddFixup = magics.IsAdd;
+ bool HasLegalI64Mul =
+ isOperationLegalOrCustom(ISD::MUL, I64VT, IsAfterLegalization);
+ if (IsScalar && IsNarrow && NeedsAddFixup && HasLegalI64Mul) {
+ APInt MaxX = Known0.getMaxValue();
+ SimpleWideMagicInfo Info;
+ if (findSimpleWideMagic(Divisor, MaxX, Info)) {
+ SimpleWideMulVT = I64VT;
+ EVT WideShVT = getShiftAmountTy(I64VT, DAG.getDataLayout());
+ SimpleWideMulMagic = DAG.getConstant(Info.Magic, dl, I64VT);
+ SimpleWideMulShift = DAG.getConstant(Info.Shift, dl, WideShVT);
+ }
+ }
}
PreShifts.push_back(PreShift);
@@ -6962,6 +7046,21 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
PostShift = PostShifts[0];
}
+ // Simple wide magic: trunc(srl(mul(zext(x, 64), Magic), Shift)).
+ // Only applies to narrow scalars (i8, i16); divisor=1 is excluded (never
+ // sets SimpleWideMulMagic), so no IsOne select is needed here.
+ if (SimpleWideMulMagic.getNode()) {
+ SDValue Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, SimpleWideMulVT, N0);
+ Created.push_back(Wide.getNode());
+ SDValue Mul =
+ DAG.getNode(ISD::MUL, dl, SimpleWideMulVT, Wide, SimpleWideMulMagic);
+ Created.push_back(Mul.getNode());
+ SDValue Srl =
+ DAG.getNode(ISD::SRL, dl, SimpleWideMulVT, Mul, SimpleWideMulShift);
+ Created.push_back(Srl.getNode());
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Srl);
+ }
+
if (UseWiden) {
// Compute: (WideSVT(x) * MagicFactor) >> WideSVTBits.
SDValue WideN0 = DAG.getNode(ISD::ZERO_EXTEND, dl, WideSVT, N0);
diff --git a/llvm/test/CodeGen/X86/udiv-narrow-magic.ll b/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
index 5aebf694da0ff..6c20b14f90ae6 100644
--- a/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
+++ b/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
@@ -12,14 +12,9 @@ define i8 @udiv_i8_by7(i8 %x) nounwind {
; CHECK-LABEL: udiv_i8_by7:
; CHECK: # %bb.0:
; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: leal (%rax,%rax,8), %ecx
-; CHECK-NEXT: leal (%rax,%rcx,4), %ecx
-; CHECK-NEXT: shrl $8, %ecx
-; CHECK-NEXT: subb %cl, %al
-; CHECK-NEXT: shrb %al
-; CHECK-NEXT: addb %cl, %al
-; CHECK-NEXT: shrb $2, %al
-; CHECK-NEXT: # kill: def $al killed $al killed $rax
+; CHECK-NEXT: imull $293, %eax, %eax # imm = 0x125
+; CHECK-NEXT: shrl $11, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%d = udiv i8 %x, 7
ret i8 %d
@@ -104,15 +99,10 @@ define i8 @udiv_i8_then_add(i8 %x, i8 %y) nounwind {
; CHECK-LABEL: udiv_i8_then_add:
; CHECK: # %bb.0:
; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: leal (%rax,%rax,8), %ecx
-; CHECK-NEXT: leal (%rax,%rcx,4), %ecx
-; CHECK-NEXT: shrl $8, %ecx
-; CHECK-NEXT: subb %cl, %al
-; CHECK-NEXT: shrb %al
-; CHECK-NEXT: addb %cl, %al
-; CHECK-NEXT: shrb $2, %al
+; CHECK-NEXT: imull $293, %eax, %eax # imm = 0x125
+; CHECK-NEXT: shrl $11, %eax
; CHECK-NEXT: addb %sil, %al
-; CHECK-NEXT: # kill: def $al killed $al killed $rax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%d = udiv i8 %x, 7
%r = add i8 %d, %y
@@ -125,14 +115,9 @@ define i16 @udiv_i16_by7(i16 %x) nounwind {
; CHECK-LABEL: udiv_i16_by7:
; CHECK: # %bb.0:
; CHECK-NEXT: movzwl %di, %eax
-; CHECK-NEXT: imull $9363, %eax, %ecx # imm = 0x2493
-; CHECK-NEXT: shrl $16, %ecx
-; CHECK-NEXT: subl %ecx, %edi
-; CHECK-NEXT: movzwl %di, %eax
-; CHECK-NEXT: shrl %eax
-; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: shrl $2, %eax
-; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: imulq $74899, %rax, %rax # imm = 0x12493
+; CHECK-NEXT: shrq $19, %rax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $rax
; CHECK-NEXT: retq
%d = udiv i16 %x, 7
ret i16 %d
@@ -156,13 +141,9 @@ define i32 @zext_udiv_i16_by7(i16 %x) nounwind {
; CHECK-LABEL: zext_udiv_i16_by7:
; CHECK: # %bb.0:
; CHECK-NEXT: movzwl %di, %eax
-; CHECK-NEXT: imull $9363, %eax, %ecx # imm = 0x2493
-; CHECK-NEXT: shrl $16, %ecx
-; CHECK-NEXT: subl %ecx, %edi
-; CHECK-NEXT: movzwl %di, %eax
-; CHECK-NEXT: shrl %eax
-; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: shrl $2, %eax
+; CHECK-NEXT: imulq $74899, %rax, %rax # imm = 0x12493
+; CHECK-NEXT: shrq $19, %rax
+; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq
%d = udiv i16 %x, 7
%z = zext i16 %d to i32
diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
index 3faa2a0720d4e..f9a3b017a6748 100644
--- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
@@ -22,8 +22,78 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,124,98,1003,u,u,u,u]
; SSE2-NEXT: psubw %xmm1, %xmm0
; SSE2-NEXT: retq
-;
-; SSE4-LABEL: fold_urem_vec_1:
+; SSE-LABEL: fold_urem_vec_1:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrw $1, %xmm0, %eax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: shrl $2, %ecx
+; SSE-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
+; SSE-NEXT: shrl $19, %ecx
+; SSE-NEXT: imull $124, %ecx, %ecx
+; SSE-NEXT: subl %ecx, %eax
+; SSE-NEXT: movd %xmm0, %ecx
+; SSE-NEXT: movzwl %cx, %edx
+; SSE-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
+; SSE-NEXT: shrl $22, %edx
+; SSE-NEXT: imull $95, %edx, %edx
+; SSE-NEXT: subl %edx, %ecx
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: pinsrw $1, %eax, %xmm1
+; SSE-NEXT: pextrw $2, %xmm0, %eax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: shrl %ecx
+; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
+; SSE-NEXT: shrl $17, %ecx
+; SSE-NEXT: imull $98, %ecx, %ecx
+; SSE-NEXT: subl %ecx, %eax
+; SSE-NEXT: pinsrw $2, %eax, %xmm1
+; SSE-NEXT: pextrw $3, %xmm0, %eax
+; SSE-NEXT: imull $1373, %eax, %ecx # imm = 0x55D
+; SSE-NEXT: shrl $16, %ecx
+; SSE-NEXT: movl %eax, %edx
+; SSE-NEXT: subl %ecx, %edx
+; SSE-NEXT: movzwl %dx, %edx
+; SSE-NEXT: shrl %edx
+; SSE-NEXT: addl %ecx, %edx
+; SSE-NEXT: shrl $9, %edx
+; SSE-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB
+; SSE-NEXT: subl %ecx, %eax
+; SSE-NEXT: pinsrw $3, %eax, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+; SSE-LABEL: fold_urem_vec_1:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrw $1, %xmm0, %eax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: shrl $2, %ecx
+; SSE-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
+; SSE-NEXT: shrl $19, %ecx
+; SSE-NEXT: imull $124, %ecx, %ecx
+; SSE-NEXT: subl %ecx, %eax
+; SSE-NEXT: movd %xmm0, %ecx
+; SSE-NEXT: movzwl %cx, %edx
+; SSE-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
+; SSE-NEXT: shrl $22, %edx
+; SSE-NEXT: imull $95, %edx, %edx
+; SSE-NEXT: subl %edx, %ecx
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: pinsrw $1, %eax, %xmm1
+; SSE-NEXT: pextrw $2, %xmm0, %eax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: shrl %ecx
+; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
+; SSE-NEXT: shrl $17, %ecx
+; SSE-NEXT: imull $98, %ecx, %ecx
+; SSE-NEXT: subl %ecx, %eax
+; SSE-NEXT: pinsrw $2, %eax, %xmm1
+; SSE-NEXT: pextrw $3, %xmm0, %eax
+; SSE-NEXT: imulq $66909, %rax, %rcx # imm = 0x1055D
+; SSE-NEXT: shrq $26, %rcx
+; SSE-NEXT: imull $1003, %ecx, %ecx # imm = 0x3EB
+; SSE-NEXT: subl %ecx, %eax
+; SSE-NEXT: pinsrw $3, %eax, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
; SSE4: # %bb.0:
; SSE4-NEXT: movq {{.*#+}} xmm1 = [0,16384,32768,0,0,0,0,0]
; SSE4-NEXT: pmulhuw %xmm0, %xmm1
@@ -62,6 +132,79 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,124,98,1003,u,u,u,u]
; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
+||||||| parent of 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
+; AVX-LABEL: fold_urem_vec_1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $1, %xmm0, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $2, %ecx
+; AVX-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
+; AVX-NEXT: shrl $19, %ecx
+; AVX-NEXT: imull $124, %ecx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vmovd %xmm0, %ecx
+; AVX-NEXT: movzwl %cx, %edx
+; AVX-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
+; AVX-NEXT: shrl $22, %edx
+; AVX-NEXT: imull $95, %edx, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrw $2, %xmm0, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl %ecx
+; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
+; AVX-NEXT: shrl $17, %ecx
+; AVX-NEXT: imull $98, %ecx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrw $3, %xmm0, %eax
+; AVX-NEXT: imull $1373, %eax, %ecx # imm = 0x55D
+; AVX-NEXT: shrl $16, %ecx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: movzwl %dx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $9, %edx
+; AVX-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX-NEXT: retq
+=======
+; AVX-LABEL: fold_urem_vec_1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $1, %xmm0, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $2, %ecx
+; AVX-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
+; AVX-NEXT: shrl $19, %ecx
+; AVX-NEXT: imull $124, %ecx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vmovd %xmm0, %ecx
+; AVX-NEXT: movzwl %cx, %edx
+; AVX-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
+; AVX-NEXT: shrl $22, %edx
+; AVX-NEXT: imull $95, %edx, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrw $2, %xmm0, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl %ecx
+; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
+; AVX-NEXT: shrl $17, %ecx
+; AVX-NEXT: imull $98, %ecx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrw $3, %xmm0, %eax
+; AVX-NEXT: imulq $66909, %rax, %rcx # imm = 0x1055D
+; AVX-NEXT: shrq $26, %rcx
+; AVX-NEXT: imull $1003, %ecx, %ecx # imm = 0x3EB
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX-NEXT: retq
+>>>>>>> 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
%1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
ret <4 x i16> %1
}
@@ -175,6 +318,7 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; Don't fold if the divisor is one.
define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
+<<<<<<< HEAD
; SSE2-LABEL: dont_fold_urem_one:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
@@ -204,7 +348,67 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,654,23,5423,u,u,u,u]
; SSE4-NEXT: psubw %xmm2, %xmm0
; SSE4-NEXT: retq
+||||||| parent of 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
+; SSE-LABEL: dont_fold_urem_one:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrw $2, %xmm0, %eax
+; SSE-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
+; SSE-NEXT: shrl $16, %ecx
+; SSE-NEXT: movl %eax, %edx
+; SSE-NEXT: subl %ecx, %edx
+; SSE-NEXT: movzwl %dx, %edx
+; SSE-NEXT: shrl %edx
+; SSE-NEXT: addl %ecx, %edx
+; SSE-NEXT: shrl $4, %edx
+; SSE-NEXT: leal (%rdx,%rdx,2), %ecx
+; SSE-NEXT: shll $3, %ecx
+; SSE-NEXT: subl %ecx, %edx
+; SSE-NEXT: addl %eax, %edx
+; SSE-NEXT: pextrw $1, %xmm0, %eax
+; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
+; SSE-NEXT: shrl $25, %ecx
+; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
+; SSE-NEXT: subl %ecx, %eax
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pinsrw $1, %eax, %xmm1
+; SSE-NEXT: pinsrw $2, %edx, %xmm1
+; SSE-NEXT: pextrw $3, %xmm0, %eax
+; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
+; SSE-NEXT: shrl $26, %ecx
+; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
+; SSE-NEXT: subl %ecx, %eax
+; SSE-NEXT: pinsrw $3, %eax, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+=======
+; SSE-LABEL: dont_fold_urem_one:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrw $1, %xmm0, %eax
+; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
+; SSE-NEXT: shrl $25, %ecx
+; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
+; SSE-NEXT: subl %ecx, %eax
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pinsrw $1, %eax, %xmm1
+; SSE-NEXT: pextrw $2, %xmm0, %eax
+; SSE-NEXT: imulq $91181, %rax, %rcx # imm = 0x1642D
+; SSE-NEXT: shrq $21, %rcx
+; SSE-NEXT: leal (%rcx,%rcx,2), %edx
+; SSE-NEXT: shll $3, %edx
+; SSE-NEXT: subl %edx, %ecx
+; SSE-NEXT: addl %eax, %ecx
+; SSE-NEXT: pinsrw $2, %ecx, %xmm1
+; SSE-NEXT: pextrw $3, %xmm0, %eax
+; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
+; SSE-NEXT: shrl $26, %ecx
+; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
+; SSE-NEXT: subl %ecx, %eax
+; SSE-NEXT: pinsrw $3, %eax, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+>>>>>>> 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
;
+<<<<<<< HEAD
; AVX1OR2-LABEL: dont_fold_urem_one:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,51307,25645,12375,u,u,u,u]
@@ -228,6 +432,94 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,654,23,5423,u,u,u,u]
; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
+||||||| parent of 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
+; AVX1OR2-LABEL: dont_fold_urem_one:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpextrw $2, %xmm0, %eax
+; AVX1OR2-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
+; AVX1OR2-NEXT: shrl $16, %ecx
+; AVX1OR2-NEXT: movl %eax, %edx
+; AVX1OR2-NEXT: subl %ecx, %edx
+; AVX1OR2-NEXT: movzwl %dx, %edx
+; AVX1OR2-NEXT: shrl %edx
+; AVX1OR2-NEXT: addl %ecx, %edx
+; AVX1OR2-NEXT: shrl $4, %edx
+; AVX1OR2-NEXT: leal (%rdx,%rdx,2), %ecx
+; AVX1OR2-NEXT: shll $3, %ecx
+; AVX1OR2-NEXT: subl %ecx, %edx
+; AVX1OR2-NEXT: addl %eax, %edx
+; AVX1OR2-NEXT: vpextrw $1, %xmm0, %eax
+; AVX1OR2-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
+; AVX1OR2-NEXT: shrl $25, %ecx
+; AVX1OR2-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
+; AVX1OR2-NEXT: subl %ecx, %eax
+; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpextrw $3, %xmm0, %eax
+; AVX1OR2-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
+; AVX1OR2-NEXT: shrl $26, %ecx
+; AVX1OR2-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
+; AVX1OR2-NEXT: subl %ecx, %eax
+; AVX1OR2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: dont_fold_urem_one:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpextrw $2, %xmm0, %eax
+; AVX512-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
+; AVX512-NEXT: shrl $16, %ecx
+; AVX512-NEXT: movl %eax, %edx
+; AVX512-NEXT: subl %ecx, %edx
+; AVX512-NEXT: movzwl %dx, %edx
+; AVX512-NEXT: shrl %edx
+; AVX512-NEXT: addl %ecx, %edx
+; AVX512-NEXT: shrl $4, %edx
+; AVX512-NEXT: leal (%rdx,%rdx,2), %ecx
+; AVX512-NEXT: shll $3, %ecx
+; AVX512-NEXT: subl %ecx, %edx
+; AVX512-NEXT: vpextrw $1, %xmm0, %ecx
+; AVX512-NEXT: addl %eax, %edx
+; AVX512-NEXT: imull $51307, %ecx, %eax # imm = 0xC86B
+; AVX512-NEXT: shrl $25, %eax
+; AVX512-NEXT: imull $654, %eax, %eax # imm = 0x28E
+; AVX512-NEXT: subl %eax, %ecx
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
+; AVX512-NEXT: vpextrw $3, %xmm0, %eax
+; AVX512-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
+; AVX512-NEXT: shrl $26, %ecx
+; AVX512-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
+; AVX512-NEXT: subl %ecx, %eax
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX512-NEXT: retq
+=======
+; AVX-LABEL: dont_fold_urem_one:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrw $1, %xmm0, %eax
+; AVX-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
+; AVX-NEXT: shrl $25, %ecx
+; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrw $2, %xmm0, %eax
+; AVX-NEXT: imulq $91181, %rax, %rcx # imm = 0x1642D
+; AVX-NEXT: shrq $21, %rcx
+; AVX-NEXT: leal (%rcx,%rcx,2), %edx
+; AVX-NEXT: shll $3, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
+; AVX-NEXT: vpextrw $3, %xmm0, %eax
+; AVX-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
+; AVX-NEXT: shrl $26, %ecx
+; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX-NEXT: retq
+>>>>>>> 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
%1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
ret <4 x i16> %1
}
@@ -445,3 +737,5 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
%1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
ret <4 x i64> %1
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX1OR2: {{.*}}
>From e72af0a35e18d987eb29ef9b2daead17006e522a Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Wed, 25 Mar 2026 17:38:22 -0400
Subject: [PATCH 3/6] refactor to embed the fallback division inside the
constant division strength reduction
---
.../llvm/Support/DivisionByConstantInfo.h | 24 ++-
.../CodeGen/SelectionDAG/TargetLowering.cpp | 156 ++++++------------
llvm/lib/Support/DivisionByConstantInfo.cpp | 74 +++++++--
llvm/test/CodeGen/AArch64/rem-by-const.ll | 20 +--
llvm/test/CodeGen/RISCV/div-by-constant.ll | 44 ++---
llvm/test/CodeGen/X86/rotate-extract.ll | 10 +-
llvm/test/CodeGen/X86/udiv-narrow-magic.ll | 40 +++++
.../Support/DivisionByConstantTest.cpp | 86 +++++++++-
8 files changed, 279 insertions(+), 175 deletions(-)
diff --git a/llvm/include/llvm/Support/DivisionByConstantInfo.h b/llvm/include/llvm/Support/DivisionByConstantInfo.h
index 283283c912dfe..3e898e9321d1d 100644
--- a/llvm/include/llvm/Support/DivisionByConstantInfo.h
+++ b/llvm/include/llvm/Support/DivisionByConstantInfo.h
@@ -18,6 +18,26 @@
namespace llvm {
+/// Standard integer bitwidths that division strength-reduction may widen to.
+/// The numeric value is the actual bit count, so arithmetic on it is valid.
+enum class IntegerBitWidth : unsigned {
+ None = 0,
+ I8 = 8,
+ I16 = 16,
+ I32 = 32,
+ I64 = 64,
+ I128 = 128,
+};
+
+/// Widening strategies for unsigned division by a constant.
+enum class UnsignedDivisionByConstantWidening {
+ None,
+ /// Use a widened high-half multiply and truncate the result.
+ MulHigh,
+ /// Use a widened full multiply followed by an explicit right shift.
+ FullMultiply,
+};
+
/// Magic data for optimising signed division by a constant.
struct SignedDivisionByConstantInfo {
LLVM_ABI static SignedDivisionByConstantInfo get(const APInt &D);
@@ -30,12 +50,12 @@ struct UnsignedDivisionByConstantInfo {
LLVM_ABI static UnsignedDivisionByConstantInfo
get(const APInt &D, unsigned LeadingZeros = 0,
bool AllowEvenDivisorOptimization = true,
- bool AllowWidenOptimization = false);
+ IntegerBitWidth MaxBitWidth = IntegerBitWidth::None);
APInt Magic; ///< magic number
bool IsAdd; ///< add indicator
unsigned PostShift; ///< post-shift amount
unsigned PreShift; ///< pre-shift amount
- bool Widen; ///< use widen optimization
+ UnsignedDivisionByConstantWidening Widening;
};
} // namespace llvm
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 28f962c93b981..8880fd150ad2e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6781,56 +6781,6 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::ADD, dl, VT, Q, T);
}
-/// For narrow scalar types (i8/i16) where Hacker's Delight requires an
-/// add-and-shift fixup (IsAdd), check whether a fixup-free 64-bit magic
-/// multiply exists:
-/// trunc(srl(mul(zext(x, 64), Magic), Shift))
-/// where Magic = ceil(2^Shift / C).
-///
-/// No fixup is needed when two conditions hold:
-/// (1) MaxX * Magic < 2^64 (multiply doesn't overflow 64 bits)
-/// (2) MaxX * (Magic*C - 2^Shift) < 2^Shift (approximation error is exact)
-///
-/// When IsAdd is required by HD, no 32-bit fixup-free solution exists, so we
-/// search only in 64-bit space. Populates Info and returns true on success.
-struct SimpleWideMagicInfo {
- APInt Magic;
- unsigned Shift;
-};
-
-static bool findSimpleWideMagic(const APInt &Divisor, const APInt &MaxX,
- SimpleWideMagicInfo &Info) {
- APInt DivWide = Divisor.zext(64);
- APInt MaxWide = MaxX.zext(64);
- unsigned MinShift = Divisor.ceilLogBase2();
-
- for (unsigned Shift = MinShift; Shift < 64; ++Shift) {
- APInt TwoToS = APInt(64, 1).shl(Shift);
- APInt Magic = APIntOps::RoundingUDiv(TwoToS, DivWide, APInt::Rounding::UP);
-
- // Check (1): MaxX * Magic must fit in 64 bits. Magic = ceil(2^Shift / C)
- // grows monotonically with Shift, so once this overflows no larger Shift
- // can succeed either.
- bool Overflow = false;
- (void)MaxWide.umul_ov(Magic, Overflow);
- if (Overflow)
- break;
-
- // Check (2): MaxX * (Magic*C - 2^Shift) < 2^Shift.
- // Magic*C never overflows 64 bits for i8/i16: Magic*C <= 2^Shift + C
- // <= 2^63 + 65535 < 2^64.
- APInt Error = Magic * DivWide - TwoToS;
- APInt MaxError = MaxWide.umul_ov(Error, Overflow);
- if (Overflow || MaxError.uge(TwoToS))
- continue;
-
- Info = {Magic, Shift};
- return true;
- }
-
- return false;
-}
-
/// Given an ISD::UDIV node expressing a divide by constant,
/// return a DAG expression to select that will generate the same value by
/// multiplying by a magic number.
@@ -6933,16 +6883,20 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
VT == MVT::i32 &&
isOperationLegalOrCustom(ISD::UMUL_LOHI, WideSVT, IsAfterLegalization);
const bool AllowWiden = (HasWideMULHU || HasWideUMUL_LOHI);
+ // For narrow scalars (i8, i16), a fixup-free 64-bit magic may exist when
+ // i64 MUL is available: trunc(srl(mul(zext(x, 64), ceil(2^S/C)), S)).
+ const bool HasLegalI64Mul =
+ isOperationLegalOrCustom(ISD::MUL, WideSVT, IsAfterLegalization);
+ const bool AllowNarrowWiden =
+ EltBits <= 16 && !VT.isVector() && HasLegalI64Mul;
+ const IntegerBitWidth MaxBitWidth = (AllowWiden || AllowNarrowWiden)
+ ? IntegerBitWidth::I64
+ : IntegerBitWidth::None;
bool UseNPQ = false, UsePreShift = false, UsePostShift = false;
- bool UseWiden = false;
- // For narrow scalar types (i8, i16) a simple fixup-free wide magic may exist:
- // trunc(srl(mul(zext(x, W), ceil(2^Shift / C)), Shift))
- // This is preferred over the NPQ add-and-shift fixup when it applies.
- // SimpleWideMulMagic being non-null indicates this path was taken.
- EVT SimpleWideMulVT;
- SDValue SimpleWideMulMagic;
- SDValue SimpleWideMulShift;
+ UnsignedDivisionByConstantWidening WideningKind =
+ UnsignedDivisionByConstantWidening::None;
+ SDValue SimpleWidenShift;
SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
auto BuildUDIVPattern = [&](ConstantSDNode *C) {
@@ -6964,18 +6918,31 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
UnsignedDivisionByConstantInfo::get(
Divisor, std::min(KnownLeadingZeros, Divisor.countl_zero()),
/*AllowEvenDivisorOptimization=*/true,
- /*AllowWidenOptimization=*/AllowWiden);
+ /*MaxBitWidth=*/MaxBitWidth);
- if (magics.Widen) {
- UseWiden = true;
- MagicFactor = DAG.getConstant(magics.Magic, dl, WideSVT);
- } else {
+ switch (magics.Widening) {
+ case UnsignedDivisionByConstantWidening::None:
MagicFactor = DAG.getConstant(magics.Magic.zext(SVTBits), dl, SVT);
+ break;
+ case UnsignedDivisionByConstantWidening::MulHigh:
+ WideningKind = UnsignedDivisionByConstantWidening::MulHigh;
+ MagicFactor = DAG.getConstant(magics.Magic, dl, WideSVT);
+ break;
+ case UnsignedDivisionByConstantWidening::FullMultiply:
+ WideningKind = UnsignedDivisionByConstantWidening::FullMultiply;
+ MagicFactor = DAG.getConstant(magics.Magic, dl, WideSVT);
+ // Simple wide magic (narrow types): explicit shift after multiply.
+ SimpleWidenShift =
+ DAG.getConstant(magics.PostShift, dl,
+ getShiftAmountTy(WideSVT, DAG.getDataLayout()));
+ break;
}
assert(magics.PreShift < Divisor.getBitWidth() &&
"We shouldn't generate an undefined shift!");
- assert(magics.PostShift < Divisor.getBitWidth() &&
+ assert((magics.Widening !=
+ UnsignedDivisionByConstantWidening::FullMultiply ||
+ magics.PostShift < magics.Magic.getBitWidth()) &&
"We shouldn't generate an undefined shift!");
assert((!magics.IsAdd || magics.PreShift == 0) &&
"Unexpected pre-shift");
@@ -6987,30 +6954,9 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
dl, SVT);
UseNPQ |= magics.IsAdd;
UsePreShift |= magics.PreShift != 0;
- UsePostShift |= magics.PostShift != 0;
-
- // For narrow scalar types (i8, i16), when the Hacker's Delight magic
- // requires the expensive NPQ add-and-shift fixup (IsAdd), check whether
- // a simple fixup-free 64-bit magic exists:
- // trunc(srl(mul(zext(x, 64), ceil(2^Shift / C)), Shift))
- // When IsAdd is required by HD, no 32-bit fixup-free solution exists, so
- // we go directly to 64-bit. Only attempt when i64 MUL is natively legal.
- EVT I64VT = EVT::getIntegerVT(*DAG.getContext(), 64);
- bool IsScalar = !VT.isVector();
- bool IsNarrow = EltBits <= 16;
- bool NeedsAddFixup = magics.IsAdd;
- bool HasLegalI64Mul =
- isOperationLegalOrCustom(ISD::MUL, I64VT, IsAfterLegalization);
- if (IsScalar && IsNarrow && NeedsAddFixup && HasLegalI64Mul) {
- APInt MaxX = Known0.getMaxValue();
- SimpleWideMagicInfo Info;
- if (findSimpleWideMagic(Divisor, MaxX, Info)) {
- SimpleWideMulVT = I64VT;
- EVT WideShVT = getShiftAmountTy(I64VT, DAG.getDataLayout());
- SimpleWideMulMagic = DAG.getConstant(Info.Magic, dl, I64VT);
- SimpleWideMulShift = DAG.getConstant(Info.Shift, dl, WideShVT);
- }
- }
+ UsePostShift |=
+ magics.Widening == UnsignedDivisionByConstantWidening::None &&
+ magics.PostShift != 0;
}
PreShifts.push_back(PreShift);
@@ -7046,27 +6992,27 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
PostShift = PostShifts[0];
}
- // Simple wide magic: trunc(srl(mul(zext(x, 64), Magic), Shift)).
- // Only applies to narrow scalars (i8, i16); divisor=1 is excluded (never
- // sets SimpleWideMulMagic), so no IsOne select is needed here.
- if (SimpleWideMulMagic.getNode()) {
- SDValue Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, SimpleWideMulVT, N0);
- Created.push_back(Wide.getNode());
- SDValue Mul =
- DAG.getNode(ISD::MUL, dl, SimpleWideMulVT, Wide, SimpleWideMulMagic);
+ switch (WideningKind) {
+ case UnsignedDivisionByConstantWidening::None:
+ break;
+ case UnsignedDivisionByConstantWidening::FullMultiply: {
+ SDValue WideN0 = DAG.getNode(ISD::ZERO_EXTEND, dl, WideSVT, N0);
+ Created.push_back(WideN0.getNode());
+ assert(EltBits <= 16 && !VT.isVector() &&
+ "FullMultiply widening is only expected for narrow scalars");
+ // Narrow scalar: trunc(srl(mul(zext(x, 64), ceil(2^S/C)), S)).
+ // divisor=1 never reaches here (handled above), so no IsOne select needed.
+ SDValue Mul = DAG.getNode(ISD::MUL, dl, WideSVT, WideN0, MagicFactor);
Created.push_back(Mul.getNode());
- SDValue Srl =
- DAG.getNode(ISD::SRL, dl, SimpleWideMulVT, Mul, SimpleWideMulShift);
+ SDValue Srl = DAG.getNode(ISD::SRL, dl, WideSVT, Mul, SimpleWidenShift);
Created.push_back(Srl.getNode());
return DAG.getNode(ISD::TRUNCATE, dl, VT, Srl);
}
-
- if (UseWiden) {
- // Compute: (WideSVT(x) * MagicFactor) >> WideSVTBits.
+ case UnsignedDivisionByConstantWidening::MulHigh: {
SDValue WideN0 = DAG.getNode(ISD::ZERO_EXTEND, dl, WideSVT, N0);
-
- // Perform WideSVTxWideSVT -> 2*WideSVT multiplication and extract high
- // WideSVT bits
+ Created.push_back(WideN0.getNode());
+ assert(VT == MVT::i32 && "MulHigh widening is only expected for i32");
+ // i32 -> i64: extract high 32 bits of the 64-bit multiply.
SDValue High;
if (HasWideMULHU) {
High = DAG.getNode(ISD::MULHU, dl, WideSVT, WideN0, MagicFactor);
@@ -7077,10 +7023,10 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
WideN0, MagicFactor);
High = LoHi.getValue(1);
}
-
Created.push_back(High.getNode());
return DAG.getNode(ISD::TRUNCATE, dl, VT, High);
}
+ }
SDValue Q = N0;
if (UsePreShift) {
diff --git a/llvm/lib/Support/DivisionByConstantInfo.cpp b/llvm/lib/Support/DivisionByConstantInfo.cpp
index 18b756d1ce8db..4fa3d1fa54b81 100644
--- a/llvm/lib/Support/DivisionByConstantInfo.cpp
+++ b/llvm/lib/Support/DivisionByConstantInfo.cpp
@@ -14,6 +14,38 @@
using namespace llvm;
+/// Find M = ceil(2^S / D) and S such that
+/// trunc(srl(mul(zext(x, W), M), S)) == udiv(x, D)
+/// for all x in [0, MaxX], where the multiply stays within W bits (no MULHU).
+///
+/// This gives a fixup-free alternative to the Hacker's Delight add-and-shift
+/// for narrow types (i8/i16) widened into a larger integer. The HD algorithm
+/// in wide space produces MULHU-style magic (≈2^W/D), which overflows a plain
+/// W-bit multiply; this routine instead finds the smallest S ≥ ceil(log2(D))
+/// for which the product MaxX * ceil(2^S/D) fits in W bits and the rounding
+/// error is harmless.
+static bool findSimpleWideMagic(const APInt &D, const APInt &MaxX, unsigned W,
+ APInt &Magic, unsigned &Shift) {
+ APInt DivW = D.zext(W);
+ APInt MaxW = MaxX.zext(W);
+ for (unsigned S = D.ceilLogBase2(); S < W; ++S) {
+ APInt TwoToS = APInt::getOneBitSet(W, S);
+ APInt M = APIntOps::RoundingUDiv(TwoToS, DivW, APInt::Rounding::UP);
+ bool Overflow = false;
+ (void)MaxW.umul_ov(M, Overflow);
+ if (Overflow)
+ break; // M grows monotonically; no larger S can succeed.
+ APInt Error = M * DivW - TwoToS;
+ APInt MaxError = MaxW.umul_ov(Error, Overflow);
+ if (Overflow || MaxError.uge(TwoToS))
+ continue;
+ Magic = M;
+ Shift = S;
+ return true;
+ }
+ return false;
+}
+
/// Calculate the magic numbers required to implement a signed integer division
/// by a constant as a sequence of multiplies, adds and shifts. Requires that
/// the divisor not be 0, 1, or -1. Taken from "Hacker's Delight", Henry S.
@@ -73,14 +105,15 @@ SignedDivisionByConstantInfo SignedDivisionByConstantInfo::get(const APInt &D) {
UnsignedDivisionByConstantInfo
UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros,
bool AllowEvenDivisorOptimization,
- bool AllowWidenOptimization) {
+ IntegerBitWidth MaxBitWidth) {
+ unsigned WideningBitWidth = static_cast<unsigned>(MaxBitWidth);
assert(!D.isZero() && !D.isOne() && "Precondition violation.");
assert(D.getBitWidth() > 1 && "Does not work at smaller bitwidths.");
APInt Delta;
struct UnsignedDivisionByConstantInfo Retval;
Retval.IsAdd = false; // initialize "add" indicator
- Retval.Widen = false; // initialize widen indicator
+ Retval.Widening = UnsignedDivisionByConstantWidening::None;
APInt AllOnes =
APInt::getLowBitsSet(D.getBitWidth(), D.getBitWidth() - LeadingZeros);
APInt SignedMin = APInt::getSignedMinValue(D.getBitWidth());
@@ -154,19 +187,32 @@ UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros,
}
Retval.PreShift = 0;
- // For IsAdd case with AllowWidenOptimization, compute widened magic.
- // This is for optimizing 32-bit division using 64-bit multiplication.
- // The actual magic constant is 2^W + Magic ((W+1)-bit).
- // We pre-shift it left by (W*2 - OriginalShift) to avoid runtime shift.
- if (Retval.IsAdd && AllowWidenOptimization) {
+ if (Retval.IsAdd && WideningBitWidth) {
unsigned W = D.getBitWidth();
- unsigned OriginalShift = Retval.PostShift + W + 1;
- // Since PostShift >= 1, shift amount is at most W-2, so W*2 bits suffice.
- Retval.Magic = (APInt::getOneBitSet(W * 2, W) + Retval.Magic.zext(W * 2))
- .shl(W * 2 - OriginalShift);
- Retval.IsAdd = false;
- Retval.PostShift = 0;
- Retval.Widen = true;
+ if (WideningBitWidth == W * 2) {
+ // MULHU-style widen: pre-shift the (W+1)-bit magic into a W*2-bit value
+ // so a MULHU at W*2 bits (full product >> W*2) gives the quotient directly.
+ unsigned OriginalShift = Retval.PostShift + W + 1;
+ // Since PostShift >= 1, shift amount is at most W-2, so W*2 bits suffice.
+ Retval.Magic = (APInt::getOneBitSet(W * 2, W) + Retval.Magic.zext(W * 2))
+ .shl(W * 2 - OriginalShift);
+ Retval.IsAdd = false;
+ Retval.PostShift = 0;
+ Retval.Widening = UnsignedDivisionByConstantWidening::MulHigh;
+ } else if (WideningBitWidth > W * 2) {
+ // Simple wide magic: trunc(srl(mul(zext(x, WW), ceil(2^S/D)), S)), where
+ // WW = WideningBitWidth, not the narrow W above. HD in WW-bit space gives
+ // MULHU-style magic (≈2^WW/D) whose full product overflows WW bits;
+ // findSimpleWideMagic picks the smallest ceil(2^S/D) whose product with MaxX fits.
+ APInt Magic;
+ unsigned Shift;
+ if (findSimpleWideMagic(D, AllOnes, WideningBitWidth, Magic, Shift)) {
+ Retval.Magic = std::move(Magic);
+ Retval.PostShift = Shift;
+ Retval.IsAdd = false;
+ Retval.Widening = UnsignedDivisionByConstantWidening::FullMultiply;
+ }
+ }
}
return Retval;
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index 1c6b241cb8f12..b1f2d20553c6d 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -67,14 +67,10 @@ entry:
define i8 @ui8_7(i8 %a, i8 %b) {
; CHECK-SD-LABEL: ui8_7:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov w8, #37 // =0x25
+; CHECK-SD-NEXT: mov w8, #293 // =0x125
; CHECK-SD-NEXT: and w9, w0, #0xff
; CHECK-SD-NEXT: mul w8, w9, w8
-; CHECK-SD-NEXT: lsr w8, w8, #8
-; CHECK-SD-NEXT: sub w9, w0, w8
-; CHECK-SD-NEXT: and w9, w9, #0xfe
-; CHECK-SD-NEXT: add w8, w8, w9, lsr #1
-; CHECK-SD-NEXT: lsr w8, w8, #2
+; CHECK-SD-NEXT: lsr w8, w8, #11
; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3
; CHECK-SD-NEXT: add w0, w0, w8
; CHECK-SD-NEXT: ret
@@ -187,13 +183,11 @@ define i16 @ui16_7(i16 %a, i16 %b) {
; CHECK-SD-LABEL: ui16_7:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov w8, #9363 // =0x2493
-; CHECK-SD-NEXT: and w9, w0, #0xffff
-; CHECK-SD-NEXT: mul w8, w9, w8
-; CHECK-SD-NEXT: lsr w8, w8, #16
-; CHECK-SD-NEXT: sub w9, w0, w8
-; CHECK-SD-NEXT: and w9, w9, #0xfffe
-; CHECK-SD-NEXT: add w8, w8, w9, lsr #1
-; CHECK-SD-NEXT: lsr w8, w8, #2
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: and x9, x0, #0xffff
+; CHECK-SD-NEXT: movk w8, #1, lsl #16
+; CHECK-SD-NEXT: umull x8, w9, w8
+; CHECK-SD-NEXT: lsr x8, x8, #19
; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3
; CHECK-SD-NEXT: add w0, w0, w8
; CHECK-SD-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index 1aa0cd053f3ed..40b599bc1a076 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -245,31 +245,13 @@ define i8 @udiv8_constant_add(i8 %a) nounwind {
; RV32IMZB-NEXT: srli a0, a0, 2
; RV32IMZB-NEXT: ret
;
-; RV64IM-LABEL: udiv8_constant_add:
-; RV64IM: # %bb.0:
-; RV64IM-NEXT: zext.b a1, a0
-; RV64IM-NEXT: li a2, 37
-; RV64IM-NEXT: mul a1, a1, a2
-; RV64IM-NEXT: srli a1, a1, 8
-; RV64IM-NEXT: sub a0, a0, a1
-; RV64IM-NEXT: slli a0, a0, 56
-; RV64IM-NEXT: srli a0, a0, 57
-; RV64IM-NEXT: add a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 2
-; RV64IM-NEXT: ret
-;
-; RV64IMZB-LABEL: udiv8_constant_add:
-; RV64IMZB: # %bb.0:
-; RV64IMZB-NEXT: zext.b a1, a0
-; RV64IMZB-NEXT: sh3add a2, a1, a1
-; RV64IMZB-NEXT: sh2add a1, a2, a1
-; RV64IMZB-NEXT: srli a1, a1, 8
-; RV64IMZB-NEXT: sub a0, a0, a1
-; RV64IMZB-NEXT: slli a0, a0, 56
-; RV64IMZB-NEXT: srli a0, a0, 57
-; RV64IMZB-NEXT: add a0, a0, a1
-; RV64IMZB-NEXT: srli a0, a0, 2
-; RV64IMZB-NEXT: ret
+; RV64-LABEL: udiv8_constant_add:
+; RV64: # %bb.0:
+; RV64-NEXT: zext.b a0, a0
+; RV64-NEXT: li a1, 293
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: srli a0, a0, 11
+; RV64-NEXT: ret
%1 = udiv i8 %a, 7
ret i8 %1
}
@@ -311,15 +293,11 @@ define i16 @udiv16_constant_add(i16 %a) nounwind {
;
; RV64-LABEL: udiv16_constant_add:
; RV64: # %bb.0:
-; RV64-NEXT: slli a1, a0, 48
-; RV64-NEXT: lui a2, 149808
-; RV64-NEXT: mulhu a1, a1, a2
-; RV64-NEXT: srli a1, a1, 16
-; RV64-NEXT: sub a0, a0, a1
+; RV64-NEXT: lui a1, 74899
+; RV64-NEXT: slli a1, a1, 4
; RV64-NEXT: slli a0, a0, 48
-; RV64-NEXT: srli a0, a0, 49
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: srli a0, a0, 2
+; RV64-NEXT: mulhu a0, a0, a1
+; RV64-NEXT: srli a0, a0, 19
; RV64-NEXT: ret
%1 = udiv i16 %a, 7
ret i16 %1
diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll
index b5332068d7edd..66e53b21343fb 100644
--- a/llvm/test/CodeGen/X86/rotate-extract.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract.ll
@@ -240,15 +240,11 @@ define i8 @no_extract_udiv(i8 %i) nounwind {
; X64-NEXT: movzbl %dil, %ecx
; X64-NEXT: imull $171, %ecx, %eax
; X64-NEXT: shrl $8, %eax
-; X64-NEXT: imull $79, %ecx, %edx
-; X64-NEXT: shrl $8, %edx
-; X64-NEXT: subb %dl, %cl
-; X64-NEXT: shrb %cl
-; X64-NEXT: addb %dl, %cl
-; X64-NEXT: shrb $5, %cl
+; X64-NEXT: imull $335, %ecx, %ecx # imm = 0x14F
+; X64-NEXT: shrl $14, %ecx
; X64-NEXT: shlb $3, %al
+; X64-NEXT: andb $-16, %al
; X64-NEXT: orb %cl, %al
-; X64-NEXT: andb $-9, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%lhs_div = udiv i8 %i, 3
diff --git a/llvm/test/CodeGen/X86/udiv-narrow-magic.ll b/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
index 6c20b14f90ae6..5d5e893a3c59f 100644
--- a/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
+++ b/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
@@ -136,6 +136,46 @@ define i16 @udiv_i16_by100(i16 %x) nounwind {
ret i16 %d
}
+; Vector narrow udiv - should NOT use the scalar narrow-magic widening path.
+define <16 x i8> @udiv_v16i8_by7(<16 x i8> %x) nounwind {
+; CHECK-LABEL: udiv_v16i8_by7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
+; CHECK-NEXT: pmullw %xmm3, %xmm2
+; CHECK-NEXT: psrlw $8, %xmm2
+; CHECK-NEXT: movdqa %xmm0, %xmm4
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; CHECK-NEXT: pmullw %xmm3, %xmm4
+; CHECK-NEXT: psrlw $8, %xmm4
+; CHECK-NEXT: packuswb %xmm2, %xmm4
+; CHECK-NEXT: psubb %xmm4, %xmm0
+; CHECK-NEXT: psrlw $1, %xmm0
+; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: paddb %xmm4, %xmm0
+; CHECK-NEXT: psrlw $2, %xmm0
+; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: retq
+ %d = udiv <16 x i8> %x, splat (i8 7)
+ ret <16 x i8> %d
+}
+
+define <8 x i16> @udiv_v8i16_by7(<8 x i16> %x) nounwind {
+; CHECK-LABEL: udiv_v8i16_by7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; CHECK-NEXT: pmulhuw %xmm0, %xmm1
+; CHECK-NEXT: psubw %xmm1, %xmm0
+; CHECK-NEXT: psrlw $1, %xmm0
+; CHECK-NEXT: paddw %xmm1, %xmm0
+; CHECK-NEXT: psrlw $2, %xmm0
+; CHECK-NEXT: retq
+ %d = udiv <8 x i16> %x, splat (i16 7)
+ ret <8 x i16> %d
+}
+
; zext(udiv i16) - should also improve.
define i32 @zext_udiv_i16_by7(i16 %x) nounwind {
; CHECK-LABEL: zext_udiv_i16_by7:
diff --git a/llvm/unittests/Support/DivisionByConstantTest.cpp b/llvm/unittests/Support/DivisionByConstantTest.cpp
index 715dded68ff01..7ea87851ac926 100644
--- a/llvm/unittests/Support/DivisionByConstantTest.cpp
+++ b/llvm/unittests/Support/DivisionByConstantTest.cpp
@@ -95,6 +95,13 @@ APInt MULHU(APInt X, APInt Y) {
return (X.zext(WideBits) * Y.zext(WideBits)).lshr(Bits).trunc(Bits);
}
+APInt WideMULHU(APInt X, APInt Y) {
+ assert(X.getBitWidth() == Y.getBitWidth() && "Expected matching widths");
+ unsigned Bits = X.getBitWidth();
+ unsigned WideBits = 2 * Bits;
+ return (X.zext(WideBits) * Y.zext(WideBits)).lshr(Bits).trunc(Bits);
+}
+
APInt UnsignedDivideUsingMagic(const APInt &Numerator, const APInt &Divisor,
bool LZOptimization,
bool AllowEvenDivisorOptimization, bool ForceNPQ,
@@ -116,13 +123,26 @@ APInt UnsignedDivideUsingMagic(const APInt &Numerator, const APInt &Divisor,
assert(Magics.PreShift < Divisor.getBitWidth() &&
"We shouldn't generate an undefined shift!");
- assert(Magics.PostShift < Divisor.getBitWidth() &&
+ assert((Magics.Widening != UnsignedDivisionByConstantWidening::FullMultiply ||
+ Magics.PostShift < Magics.Magic.getBitWidth()) &&
"We shouldn't generate an undefined shift!");
assert((!Magics.IsAdd || Magics.PreShift == 0) && "Unexpected pre-shift");
unsigned PreShift = Magics.PreShift;
unsigned PostShift = Magics.PostShift;
bool UseNPQ = Magics.IsAdd;
+ if (Magics.Widening == UnsignedDivisionByConstantWidening::MulHigh) {
+ unsigned WideBits = Magics.Magic.getBitWidth();
+ APInt Q = WideMULHU(Numerator.zext(WideBits), Magics.Magic);
+ return Q.trunc(Bits);
+ }
+
+ if (Magics.Widening == UnsignedDivisionByConstantWidening::FullMultiply) {
+ unsigned WideBits = Magics.Magic.getBitWidth();
+ APInt Q = Numerator.zext(WideBits) * Magics.Magic;
+ return Q.lshr(PostShift).trunc(Bits);
+ }
+
APInt NPQFactor =
UseNPQ ? APInt::getSignedMinValue(Bits) : APInt::getZero(Bits);
@@ -186,4 +206,68 @@ TEST(UnsignedDivisionByConstantTest, Test) {
}
}
+TEST(UnsignedDivisionByConstantTest, WideningKinds) {
+ {
+ APInt Divisor(8, 7);
+ auto Magics = UnsignedDivisionByConstantInfo::get(
+ Divisor, /*LeadingZeros=*/0, /*AllowEvenDivisorOptimization=*/true,
+ IntegerBitWidth::I16);
+ EXPECT_EQ(Magics.Widening, UnsignedDivisionByConstantWidening::MulHigh);
+ EXPECT_EQ(Magics.Magic.getBitWidth(), 16u);
+ EXPECT_FALSE(Magics.IsAdd);
+ EXPECT_EQ(Magics.PostShift, 0u);
+ }
+
+ {
+ APInt Divisor(8, 7);
+ auto Magics = UnsignedDivisionByConstantInfo::get(
+ Divisor, /*LeadingZeros=*/0, /*AllowEvenDivisorOptimization=*/true,
+ IntegerBitWidth::I64);
+ EXPECT_EQ(Magics.Widening,
+ UnsignedDivisionByConstantWidening::FullMultiply);
+ EXPECT_EQ(Magics.Magic.getBitWidth(), 64u);
+ EXPECT_FALSE(Magics.IsAdd);
+ EXPECT_GT(Magics.PostShift, 0u);
+ }
+
+ {
+ APInt Divisor(32, 7);
+ auto Magics = UnsignedDivisionByConstantInfo::get(
+ Divisor, /*LeadingZeros=*/0, /*AllowEvenDivisorOptimization=*/true,
+ IntegerBitWidth::I64);
+ EXPECT_EQ(Magics.Widening, UnsignedDivisionByConstantWidening::MulHigh);
+ EXPECT_EQ(Magics.Magic.getBitWidth(), 64u);
+ EXPECT_FALSE(Magics.IsAdd);
+ EXPECT_EQ(Magics.PostShift, 0u);
+ }
+}
+
+TEST(UnsignedDivisionByConstantTest, WidenedMagicExecutesCorrectly) {
+ auto CheckAllNumerators = [](const APInt &Divisor,
+ IntegerBitWidth MaxBitWidth,
+ UnsignedDivisionByConstantWidening Widening) {
+ auto Magics = UnsignedDivisionByConstantInfo::get(
+ Divisor, /*LeadingZeros=*/0, /*AllowEvenDivisorOptimization=*/true,
+ MaxBitWidth);
+ ASSERT_EQ(Magics.Widening, Widening);
+ EnumerateAPInts(Divisor.getBitWidth(), [&](const APInt &Numerator) {
+ ASSERT_EQ(UnsignedDivideUsingMagic(Numerator, Divisor,
+ /*LZOptimization=*/false,
+ /*AllowEvenDivisorOptimization=*/true,
+ /*ForceNPQ=*/false, Magics),
+ Numerator.udiv(Divisor))
+ << " ... given the operation: udiv i" << Divisor.getBitWidth() << " "
+ << Numerator << ", " << Divisor << " with widening "
+ << static_cast<int>(Widening);
+ });
+ };
+
+ CheckAllNumerators(APInt(8, 7), IntegerBitWidth::I16,
+ UnsignedDivisionByConstantWidening::MulHigh);
+ CheckAllNumerators(APInt(8, 7), IntegerBitWidth::I64,
+ UnsignedDivisionByConstantWidening::FullMultiply);
+ CheckAllNumerators(APInt(16, 7), IntegerBitWidth::I64,
+ UnsignedDivisionByConstantWidening::FullMultiply);
+}
+
} // end anonymous namespace
>From e53bcb19211881e1ac6f25f9f9469a1b05ee6d06 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Wed, 25 Mar 2026 19:44:51 -0400
Subject: [PATCH 4/6] allow 32-bit targets with UMUL_LOHI to use the shift free
and fixup free division strength reduction for i8/i16
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 47 +-
.../CodeGen/Generic/udiv-narrow-widening.ll | 487 ++++++++++++++++++
.../CodeGen/X86/udiv-const-optimization.ll | 61 +++
llvm/test/CodeGen/X86/udiv-narrow-magic.ll | 191 -------
llvm/test/CodeGen/X86/urem-vector-lkk.ll | 16 +-
5 files changed, 588 insertions(+), 214 deletions(-)
create mode 100644 llvm/test/CodeGen/Generic/udiv-narrow-widening.ll
delete mode 100644 llvm/test/CodeGen/X86/udiv-narrow-magic.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 8880fd150ad2e..f2e6ca9f1c154 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6873,25 +6873,41 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
}
const unsigned SVTBits = SVT.getSizeInBits();
- // Allow i32 to be widened to i64 for uncooperative divisors if i64 MULHU or
- // UMUL_LOHI is supported.
- const EVT WideSVT = MVT::i64;
+ // Allow scalar i16 to be widened to i32 for uncooperative divisors if i32
+ // MULHU or UMUL_LOHI is supported (shiftless MulHigh, preferred over i64 widening).
+ const bool HasWideI32MULHU =
+ VT == MVT::i16 &&
+ isOperationLegalOrCustom(ISD::MULHU, MVT::i32, IsAfterLegalization);
+ const bool HasWideI32UMUL_LOHI =
+ VT == MVT::i16 &&
+ isOperationLegalOrCustom(ISD::UMUL_LOHI, MVT::i32, IsAfterLegalization);
+ // Allow scalar i32 to be widened to i64 for uncooperative divisors if i64
+ // MULHU or UMUL_LOHI is supported (shiftless MulHigh).
const bool HasWideMULHU =
- VT == MVT::i32 &&
- isOperationLegalOrCustom(ISD::MULHU, WideSVT, IsAfterLegalization);
+ HasWideI32MULHU ||
+ (VT == MVT::i32 &&
+ isOperationLegalOrCustom(ISD::MULHU, MVT::i64, IsAfterLegalization));
const bool HasWideUMUL_LOHI =
- VT == MVT::i32 &&
- isOperationLegalOrCustom(ISD::UMUL_LOHI, WideSVT, IsAfterLegalization);
+ HasWideI32UMUL_LOHI ||
+ (VT == MVT::i32 &&
+ isOperationLegalOrCustom(ISD::UMUL_LOHI, MVT::i64, IsAfterLegalization));
const bool AllowWiden = (HasWideMULHU || HasWideUMUL_LOHI);
+ // WideSVT: the doubled type for MulHigh multiplication.
+ // Use i32 for the i16->i32 case, i64 otherwise.
+ const EVT WideSVT =
+ (HasWideI32MULHU || HasWideI32UMUL_LOHI) ? MVT::i32 : MVT::i64;
// For narrow scalars (i8, i16), a fixup-free 64-bit magic may exist when
// i64 MUL is available: trunc(srl(mul(zext(x, 64), ceil(2^S/C)), S)).
+ // Skip this when i32 MulHigh is already preferred for i16.
const bool HasLegalI64Mul =
- isOperationLegalOrCustom(ISD::MUL, WideSVT, IsAfterLegalization);
- const bool AllowNarrowWiden =
- EltBits <= 16 && !VT.isVector() && HasLegalI64Mul;
- const IntegerBitWidth MaxBitWidth = (AllowWiden || AllowNarrowWiden)
- ? IntegerBitWidth::I64
- : IntegerBitWidth::None;
+ isOperationLegalOrCustom(ISD::MUL, MVT::i64, IsAfterLegalization);
+ const bool AllowNarrowWiden = EltBits <= 16 && !VT.isVector() &&
+ !(HasWideI32MULHU || HasWideI32UMUL_LOHI) &&
+ HasLegalI64Mul;
+ const IntegerBitWidth MaxBitWidth =
+ (HasWideI32MULHU || HasWideI32UMUL_LOHI) ? IntegerBitWidth::I32
+ : (AllowWiden || AllowNarrowWiden) ? IntegerBitWidth::I64
+ : IntegerBitWidth::None;
bool UseNPQ = false, UsePreShift = false, UsePostShift = false;
UnsignedDivisionByConstantWidening WideningKind =
@@ -7011,8 +7027,9 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
case UnsignedDivisionByConstantWidening::MulHigh: {
SDValue WideN0 = DAG.getNode(ISD::ZERO_EXTEND, dl, WideSVT, N0);
Created.push_back(WideN0.getNode());
- assert(VT == MVT::i32 && "MulHigh widening is only expected for i32");
- // i32 -> i64: extract high 32 bits of the 64-bit multiply.
+ assert((VT == MVT::i32 || VT == MVT::i16) &&
+ "MulHigh widening is only expected for i32 or i16");
+ // Extract the high half of the widened multiply (i16->i32 or i32->i64).
SDValue High;
if (HasWideMULHU) {
High = DAG.getNode(ISD::MULHU, dl, WideSVT, WideN0, MagicFactor);
diff --git a/llvm/test/CodeGen/Generic/udiv-narrow-widening.ll b/llvm/test/CodeGen/Generic/udiv-narrow-widening.ll
new file mode 100644
index 0000000000000..124bb7cb73565
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/udiv-narrow-widening.ll
@@ -0,0 +1,487 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=i386-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,AARCH64
+; RUN: llc -mtriple=riscv32-unknown-linux-gnu -mattr=+m < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64-unknown-linux-gnu -mattr=+m < %s | FileCheck %s --check-prefixes=CHECK,RV64
+
+; When a narrow udiv-by-constant (i8, i16) is lowered via BuildUDIV, the
+; Granlund-Montgomery (GM) algorithm may need an expensive add-and-shift fixup
+; for IsAdd=true divisors. A wider legal type (e.g. i32 for i16) has enough
+; headroom for a simple magic = ceil(2^Shift / C) that needs no fixup at all.
+;
+; i16 IsAdd=true divisors (e.g. 7): on targets with i32 UMUL_LOHI (i386,
+; x86-64) we emit a shiftless 32x32->64 multiply and take the high 32 bits.
+; On AArch64 (no i32 UMUL_LOHI) we fall back to FullMultiply via i64.
+
+; --- i8 cases ---
+
+define i8 @udiv_i8_by7(i8 %x) nounwind {
+; X86-LABEL: udiv_i8_by7:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: leal (%eax,%eax,8), %ecx
+; X86-NEXT: leal (%eax,%ecx,4), %ecx
+; X86-NEXT: subb %ch, %al
+; X86-NEXT: shrb %al
+; X86-NEXT: addb %ch, %al
+; X86-NEXT: shrb $2, %al
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: udiv_i8_by7:
+; X64: # %bb.0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: imull $293, %eax, %eax # imm = 0x125
+; X64-NEXT: shrl $11, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; AARCH64-LABEL: udiv_i8_by7:
+; AARCH64: // %bb.0:
+; AARCH64-NEXT: // kill: def $w0 killed $w0 def $x0
+; AARCH64-NEXT: mov w8, #293 // =0x125
+; AARCH64-NEXT: and x9, x0, #0xff
+; AARCH64-NEXT: umull x8, w9, w8
+; AARCH64-NEXT: lsr x0, x8, #11
+; AARCH64-NEXT: // kill: def $w0 killed $w0 killed $x0
+; AARCH64-NEXT: ret
+;
+; RV32-LABEL: udiv_i8_by7:
+; RV32: # %bb.0:
+; RV32-NEXT: zext.b a1, a0
+; RV32-NEXT: li a2, 37
+; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: srli a1, a1, 8
+; RV32-NEXT: sub a0, a0, a1
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srli a0, a0, 25
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i8_by7:
+; RV64: # %bb.0:
+; RV64-NEXT: zext.b a0, a0
+; RV64-NEXT: li a1, 293
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: srli a0, a0, 11
+; RV64-NEXT: ret
+ %d = udiv i8 %x, 7
+ ret i8 %d
+}
+
+define i8 @udiv_i8_by5(i8 %x) nounwind {
+; X86-LABEL: udiv_i8_by5:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull $205, %eax, %eax
+; X86-NEXT: shrl $10, %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: udiv_i8_by5:
+; X64: # %bb.0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: imull $205, %eax, %eax
+; X64-NEXT: shrl $10, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; AARCH64-LABEL: udiv_i8_by5:
+; AARCH64: // %bb.0:
+; AARCH64-NEXT: mov w8, #205 // =0xcd
+; AARCH64-NEXT: and w9, w0, #0xff
+; AARCH64-NEXT: mul w8, w9, w8
+; AARCH64-NEXT: lsr w0, w8, #10
+; AARCH64-NEXT: ret
+;
+; RV32-LABEL: udiv_i8_by5:
+; RV32: # %bb.0:
+; RV32-NEXT: zext.b a0, a0
+; RV32-NEXT: li a1, 205
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: srli a0, a0, 10
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i8_by5:
+; RV64: # %bb.0:
+; RV64-NEXT: zext.b a0, a0
+; RV64-NEXT: li a1, 205
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: srli a0, a0, 10
+; RV64-NEXT: ret
+ %d = udiv i8 %x, 5
+ ret i8 %d
+}
+
+define i8 @udiv_i8_by3(i8 %x) nounwind {
+; X86-LABEL: udiv_i8_by3:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull $171, %eax, %eax
+; X86-NEXT: shrl $9, %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: udiv_i8_by3:
+; X64: # %bb.0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: imull $171, %eax, %eax
+; X64-NEXT: shrl $9, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; AARCH64-LABEL: udiv_i8_by3:
+; AARCH64: // %bb.0:
+; AARCH64-NEXT: mov w8, #171 // =0xab
+; AARCH64-NEXT: and w9, w0, #0xff
+; AARCH64-NEXT: mul w8, w9, w8
+; AARCH64-NEXT: lsr w0, w8, #9
+; AARCH64-NEXT: ret
+;
+; RV32-LABEL: udiv_i8_by3:
+; RV32: # %bb.0:
+; RV32-NEXT: zext.b a0, a0
+; RV32-NEXT: li a1, 171
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: srli a0, a0, 9
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i8_by3:
+; RV64: # %bb.0:
+; RV64-NEXT: zext.b a0, a0
+; RV64-NEXT: li a1, 171
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: srli a0, a0, 9
+; RV64-NEXT: ret
+ %d = udiv i8 %x, 3
+ ret i8 %d
+}
+
+; Even divisor - stock uses pre-shift + magic; simple magic avoids pre-shift.
+define i8 @udiv_i8_by78(i8 %x) nounwind {
+; X86-LABEL: udiv_i8_by78:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shrb %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: imull $211, %eax, %eax
+; X86-NEXT: shrl $13, %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: udiv_i8_by78:
+; X64: # %bb.0:
+; X64-NEXT: shrb %dil
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: imull $211, %eax, %eax
+; X64-NEXT: shrl $13, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; AARCH64-LABEL: udiv_i8_by78:
+; AARCH64: // %bb.0:
+; AARCH64-NEXT: ubfx w8, w0, #1, #7
+; AARCH64-NEXT: mov w9, #211 // =0xd3
+; AARCH64-NEXT: mul w8, w8, w9
+; AARCH64-NEXT: lsr w0, w8, #13
+; AARCH64-NEXT: ret
+;
+; RV32-LABEL: udiv_i8_by78:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srli a0, a0, 25
+; RV32-NEXT: li a1, 211
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: srli a0, a0, 13
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i8_by78:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srli a0, a0, 57
+; RV64-NEXT: li a1, 211
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: srli a0, a0, 13
+; RV64-NEXT: ret
+ %d = udiv i8 %x, 78
+ ret i8 %d
+}
+
+define i8 @udiv_i8_by116(i8 %x) nounwind {
+; X86-LABEL: udiv_i8_by116:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shrb $2, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: imull $71, %eax, %eax
+; X86-NEXT: shrl $11, %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: udiv_i8_by116:
+; X64: # %bb.0:
+; X64-NEXT: shrb $2, %dil
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: imull $71, %eax, %eax
+; X64-NEXT: shrl $11, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; AARCH64-LABEL: udiv_i8_by116:
+; AARCH64: // %bb.0:
+; AARCH64-NEXT: ubfx w8, w0, #2, #6
+; AARCH64-NEXT: mov w9, #71 // =0x47
+; AARCH64-NEXT: mul w8, w8, w9
+; AARCH64-NEXT: lsr w0, w8, #11
+; AARCH64-NEXT: ret
+;
+; RV32-LABEL: udiv_i8_by116:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srli a0, a0, 26
+; RV32-NEXT: li a1, 71
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: srli a0, a0, 11
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i8_by116:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srli a0, a0, 58
+; RV64-NEXT: li a1, 71
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: srli a0, a0, 11
+; RV64-NEXT: ret
+ %d = udiv i8 %x, 116
+ ret i8 %d
+}
+
+; Power of 2 - should NOT fire (already lowered to shift).
+define i8 @udiv_i8_by4(i8 %x) nounwind {
+; X86-LABEL: udiv_i8_by4:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shrb $2, %al
+; X86-NEXT: retl
+;
+; X64-LABEL: udiv_i8_by4:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shrb $2, %al
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; AARCH64-LABEL: udiv_i8_by4:
+; AARCH64: // %bb.0:
+; AARCH64-NEXT: ubfx w0, w0, #2, #6
+; AARCH64-NEXT: ret
+;
+; RV32-LABEL: udiv_i8_by4:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srli a0, a0, 26
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i8_by4:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a0, a0, 56
+; RV64-NEXT: srli a0, a0, 58
+; RV64-NEXT: ret
+ %d = udiv i8 %x, 4
+ ret i8 %d
+}
+
+; Division by 1 - should NOT fire.
+define i8 @udiv_i8_by1(i8 %x) nounwind {
+; X86-LABEL: udiv_i8_by1:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: udiv_i8_by1:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; AARCH64-LABEL: udiv_i8_by1:
+; AARCH64: // %bb.0:
+; AARCH64-NEXT: ret
+;
+; RV32-LABEL: udiv_i8_by1:
+; RV32: # %bb.0:
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i8_by1:
+; RV64: # %bb.0:
+; RV64-NEXT: ret
+ %d = udiv i8 %x, 1
+ ret i8 %d
+}
+
+; Bare i8 udiv feeding another i8 op (no zext).
+define i8 @udiv_i8_then_add(i8 %x, i8 %y) nounwind {
+; X86-LABEL: udiv_i8_then_add:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: leal (%eax,%eax,8), %ecx
+; X86-NEXT: leal (%eax,%ecx,4), %ecx
+; X86-NEXT: subb %ch, %al
+; X86-NEXT: shrb %al
+; X86-NEXT: addb %ch, %al
+; X86-NEXT: shrb $2, %al
+; X86-NEXT: addb {{[0-9]+}}(%esp), %al
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: udiv_i8_then_add:
+; X64: # %bb.0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: imull $293, %eax, %eax # imm = 0x125
+; X64-NEXT: shrl $11, %eax
+; X64-NEXT: addb %sil, %al
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; AARCH64-LABEL: udiv_i8_then_add:
+; AARCH64: // %bb.0:
+; AARCH64-NEXT: mov w8, #293 // =0x125
+; AARCH64-NEXT: and w9, w0, #0xff
+; AARCH64-NEXT: mul w8, w9, w8
+; AARCH64-NEXT: add w0, w1, w8, lsr #11
+; AARCH64-NEXT: ret
+;
+; RV32-LABEL: udiv_i8_then_add:
+; RV32: # %bb.0:
+; RV32-NEXT: zext.b a2, a0
+; RV32-NEXT: li a3, 37
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: srli a2, a2, 8
+; RV32-NEXT: sub a0, a0, a2
+; RV32-NEXT: slli a0, a0, 24
+; RV32-NEXT: srli a0, a0, 25
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i8_then_add:
+; RV64: # %bb.0:
+; RV64-NEXT: zext.b a0, a0
+; RV64-NEXT: li a2, 293
+; RV64-NEXT: mul a0, a0, a2
+; RV64-NEXT: srli a0, a0, 11
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: ret
+ %d = udiv i8 %x, 7
+ %r = add i8 %d, %y
+ ret i8 %r
+}
+
+; --- i16 cases ---
+
+; IsAdd=true: i386 uses shiftless mull (UMUL_LOHI), x86-64 uses imulq+shrq $32,
+; AArch64 falls back to umull+lsr (FullMultiply via i64).
+define i16 @udiv_i16_by7(i16 %x) nounwind {
+; X86-LABEL: udiv_i16_by7:
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $613572608, %ecx # imm = 0x24926000
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: udiv_i16_by7:
+; X64: # %bb.0:
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: imulq $613572608, %rax, %rax # imm = 0x24926000
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: # kill: def $ax killed $ax killed $rax
+; X64-NEXT: retq
+;
+; AARCH64-LABEL: udiv_i16_by7:
+; AARCH64: // %bb.0:
+; AARCH64-NEXT: mov w8, #9363 // =0x2493
+; AARCH64-NEXT: // kill: def $w0 killed $w0 def $x0
+; AARCH64-NEXT: and x9, x0, #0xffff
+; AARCH64-NEXT: movk w8, #1, lsl #16
+; AARCH64-NEXT: umull x8, w9, w8
+; AARCH64-NEXT: lsr x0, x8, #19
+; AARCH64-NEXT: // kill: def $w0 killed $w0 killed $x0
+; AARCH64-NEXT: ret
+;
+; RV32-LABEL: udiv_i16_by7:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srli a0, a0, 16
+; RV32-NEXT: lui a1, 149798
+; RV32-NEXT: mulhu a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i16_by7:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a1, 74899
+; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: mulhu a0, a0, a1
+; RV64-NEXT: srli a0, a0, 19
+; RV64-NEXT: ret
+ %d = udiv i16 %x, 7
+ ret i16 %d
+}
+
+define i16 @udiv_i16_by100(i16 %x) nounwind {
+; X86-LABEL: udiv_i16_by100:
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shrl $2, %eax
+; X86-NEXT: imull $5243, %eax, %eax # imm = 0x147B
+; X86-NEXT: shrl $17, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: udiv_i16_by100:
+; X64: # %bb.0:
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: shrl $2, %eax
+; X64-NEXT: imull $5243, %eax, %eax # imm = 0x147B
+; X64-NEXT: shrl $17, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; AARCH64-LABEL: udiv_i16_by100:
+; AARCH64: // %bb.0:
+; AARCH64-NEXT: ubfx w8, w0, #2, #14
+; AARCH64-NEXT: mov w9, #5243 // =0x147b
+; AARCH64-NEXT: mul w8, w8, w9
+; AARCH64-NEXT: lsr w0, w8, #17
+; AARCH64-NEXT: ret
+;
+; RV32-LABEL: udiv_i16_by100:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: lui a1, 1
+; RV32-NEXT: srli a0, a0, 18
+; RV32-NEXT: addi a1, a1, 1147
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: srli a0, a0, 17
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i16_by100:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: lui a1, 1
+; RV64-NEXT: srli a0, a0, 50
+; RV64-NEXT: addi a1, a1, 1147
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: srli a0, a0, 17
+; RV64-NEXT: ret
+ %d = udiv i16 %x, 100
+ ret i16 %d
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/udiv-const-optimization.ll b/llvm/test/CodeGen/X86/udiv-const-optimization.ll
index a4fa413bab038..5e2518108512a 100644
--- a/llvm/test/CodeGen/X86/udiv-const-optimization.ll
+++ b/llvm/test/CodeGen/X86/udiv-const-optimization.ll
@@ -139,3 +139,64 @@ define i32 @udiv_by_3(i32 %x) nounwind {
%div = udiv i32 %x, 3
ret i32 %div
}
+
+; Test i16 udiv optimization: shiftless 32x32->hi32 via UMUL_LOHI(i32).
+; On i386, this uses mull (4 instructions, down from the 8-instruction IsAdd GM sequence).
+; On x86-64, the UMUL_LOHI(i32) folds into imulq+shrq $32 (3 instructions).
+define i16 @udiv_i16_by_7(i16 %x) nounwind {
+; X64-LABEL: udiv_i16_by_7:
+; X64: # %bb.0:
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: imulq $613572608, %rax, %rax # imm = 0x24926000
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: # kill: def $ax killed $ax killed $rax
+; X64-NEXT: retq
+;
+; X64-BMI2-LABEL: udiv_i16_by_7:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movzwl %di, %eax
+; X64-BMI2-NEXT: imulq $613572608, %rax, %rax # imm = 0x24926000
+; X64-BMI2-NEXT: shrq $32, %rax
+; X64-BMI2-NEXT: # kill: def $ax killed $ax killed $rax
+; X64-BMI2-NEXT: retq
+;
+; X86-LABEL: udiv_i16_by_7:
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $613572608, %ecx # imm = 0x24926000
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+ %div = udiv i16 %x, 7
+ ret i16 %div
+}
+
+; Test non-optimized i16 case: IsAdd=false divisor keeps the plain 32-bit multiply + shift.
+define i16 @udiv_i16_by_3(i16 %x) nounwind {
+; X64-LABEL: udiv_i16_by_3:
+; X64: # %bb.0:
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: imull $43691, %eax, %eax # imm = 0xAAAB
+; X64-NEXT: shrl $17, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; X64-BMI2-LABEL: udiv_i16_by_3:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movzwl %di, %eax
+; X64-BMI2-NEXT: imull $43691, %eax, %eax # imm = 0xAAAB
+; X64-BMI2-NEXT: shrl $17, %eax
+; X64-BMI2-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-BMI2-NEXT: retq
+;
+; X86-LABEL: udiv_i16_by_3:
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull $43691, %eax, %eax # imm = 0xAAAB
+; X86-NEXT: shrl $17, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+ %div = udiv i16 %x, 3
+ ret i16 %div
+}
diff --git a/llvm/test/CodeGen/X86/udiv-narrow-magic.ll b/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
deleted file mode 100644
index 5d5e893a3c59f..0000000000000
--- a/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
+++ /dev/null
@@ -1,191 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=x86_64 < %s | FileCheck %s
-
-; When a narrow udiv-by-constant (i8, i16) is lowered via BuildUDIV, the
-; Hacker's Delight algorithm may need an expensive add-and-shift fixup.
-; A wider legal type (e.g. i32 for i8) has enough headroom for a simple
-; Magic = ceil(2^Shift / C) that needs no fixup at all.
-
-; --- i8 cases ---
-
-define i8 @udiv_i8_by7(i8 %x) nounwind {
-; CHECK-LABEL: udiv_i8_by7:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: imull $293, %eax, %eax # imm = 0x125
-; CHECK-NEXT: shrl $11, %eax
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: retq
- %d = udiv i8 %x, 7
- ret i8 %d
-}
-
-define i8 @udiv_i8_by5(i8 %x) nounwind {
-; CHECK-LABEL: udiv_i8_by5:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: imull $205, %eax, %eax
-; CHECK-NEXT: shrl $10, %eax
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: retq
- %d = udiv i8 %x, 5
- ret i8 %d
-}
-
-define i8 @udiv_i8_by3(i8 %x) nounwind {
-; CHECK-LABEL: udiv_i8_by3:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: imull $171, %eax, %eax
-; CHECK-NEXT: shrl $9, %eax
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: retq
- %d = udiv i8 %x, 3
- ret i8 %d
-}
-
-; Even divisor - stock uses pre-shift + magic; simple magic avoids pre-shift.
-define i8 @udiv_i8_by78(i8 %x) nounwind {
-; CHECK-LABEL: udiv_i8_by78:
-; CHECK: # %bb.0:
-; CHECK-NEXT: shrb %dil
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: imull $211, %eax, %eax
-; CHECK-NEXT: shrl $13, %eax
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: retq
- %d = udiv i8 %x, 78
- ret i8 %d
-}
-
-define i8 @udiv_i8_by116(i8 %x) nounwind {
-; CHECK-LABEL: udiv_i8_by116:
-; CHECK: # %bb.0:
-; CHECK-NEXT: shrb $2, %dil
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: imull $71, %eax, %eax
-; CHECK-NEXT: shrl $11, %eax
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: retq
- %d = udiv i8 %x, 116
- ret i8 %d
-}
-
-; Power of 2 - should NOT fire (already lowered to shift).
-define i8 @udiv_i8_by4(i8 %x) nounwind {
-; CHECK-LABEL: udiv_i8_by4:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: shrb $2, %al
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: retq
- %d = udiv i8 %x, 4
- ret i8 %d
-}
-
-; Division by 1 - should NOT fire.
-define i8 @udiv_i8_by1(i8 %x) nounwind {
-; CHECK-LABEL: udiv_i8_by1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: retq
- %d = udiv i8 %x, 1
- ret i8 %d
-}
-
-; Bare i8 udiv feeding another i8 op (no zext).
-define i8 @udiv_i8_then_add(i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: udiv_i8_then_add:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: imull $293, %eax, %eax # imm = 0x125
-; CHECK-NEXT: shrl $11, %eax
-; CHECK-NEXT: addb %sil, %al
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: retq
- %d = udiv i8 %x, 7
- %r = add i8 %d, %y
- ret i8 %r
-}
-
-; --- i16 cases ---
-
-define i16 @udiv_i16_by7(i16 %x) nounwind {
-; CHECK-LABEL: udiv_i16_by7:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movzwl %di, %eax
-; CHECK-NEXT: imulq $74899, %rax, %rax # imm = 0x12493
-; CHECK-NEXT: shrq $19, %rax
-; CHECK-NEXT: # kill: def $ax killed $ax killed $rax
-; CHECK-NEXT: retq
- %d = udiv i16 %x, 7
- ret i16 %d
-}
-
-define i16 @udiv_i16_by100(i16 %x) nounwind {
-; CHECK-LABEL: udiv_i16_by100:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movzwl %di, %eax
-; CHECK-NEXT: shrl $2, %eax
-; CHECK-NEXT: imull $5243, %eax, %eax # imm = 0x147B
-; CHECK-NEXT: shrl $17, %eax
-; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT: retq
- %d = udiv i16 %x, 100
- ret i16 %d
-}
-
-; Vector narrow udiv - should NOT use the scalar narrow-magic widening path.
-define <16 x i8> @udiv_v16i8_by7(<16 x i8> %x) nounwind {
-; CHECK-LABEL: udiv_v16i8_by7:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pxor %xmm1, %xmm1
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
-; CHECK-NEXT: pmullw %xmm3, %xmm2
-; CHECK-NEXT: psrlw $8, %xmm2
-; CHECK-NEXT: movdqa %xmm0, %xmm4
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; CHECK-NEXT: pmullw %xmm3, %xmm4
-; CHECK-NEXT: psrlw $8, %xmm4
-; CHECK-NEXT: packuswb %xmm2, %xmm4
-; CHECK-NEXT: psubb %xmm4, %xmm0
-; CHECK-NEXT: psrlw $1, %xmm0
-; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: paddb %xmm4, %xmm0
-; CHECK-NEXT: psrlw $2, %xmm0
-; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: retq
- %d = udiv <16 x i8> %x, splat (i8 7)
- ret <16 x i8> %d
-}
-
-define <8 x i16> @udiv_v8i16_by7(<8 x i16> %x) nounwind {
-; CHECK-LABEL: udiv_v8i16_by7:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
-; CHECK-NEXT: pmulhuw %xmm0, %xmm1
-; CHECK-NEXT: psubw %xmm1, %xmm0
-; CHECK-NEXT: psrlw $1, %xmm0
-; CHECK-NEXT: paddw %xmm1, %xmm0
-; CHECK-NEXT: psrlw $2, %xmm0
-; CHECK-NEXT: retq
- %d = udiv <8 x i16> %x, splat (i16 7)
- ret <8 x i16> %d
-}
-
-; zext(udiv i16) - should also improve.
-define i32 @zext_udiv_i16_by7(i16 %x) nounwind {
-; CHECK-LABEL: zext_udiv_i16_by7:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movzwl %di, %eax
-; CHECK-NEXT: imulq $74899, %rax, %rax # imm = 0x12493
-; CHECK-NEXT: shrq $19, %rax
-; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
-; CHECK-NEXT: retq
- %d = udiv i16 %x, 7
- %z = zext i16 %d to i32
- ret i32 %z
-}
diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
index f9a3b017a6748..89cef0daffd77 100644
--- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
@@ -87,8 +87,8 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pinsrw $2, %eax, %xmm1
; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: imulq $66909, %rax, %rcx # imm = 0x1055D
-; SSE-NEXT: shrq $26, %rcx
+; SSE-NEXT: imulq $4282176, %rax, %rcx # imm = 0x415740
+; SSE-NEXT: shrq $32, %rcx
; SSE-NEXT: imull $1003, %ecx, %ecx # imm = 0x3EB
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pinsrw $3, %eax, %xmm1
@@ -198,8 +198,8 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: imulq $66909, %rax, %rcx # imm = 0x1055D
-; AVX-NEXT: shrq $26, %rcx
+; AVX-NEXT: imulq $4282176, %rax, %rcx # imm = 0x415740
+; AVX-NEXT: shrq $32, %rcx
; AVX-NEXT: imull $1003, %ecx, %ecx # imm = 0x3EB
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
@@ -391,8 +391,8 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pinsrw $1, %eax, %xmm1
; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: imulq $91181, %rax, %rcx # imm = 0x1642D
-; SSE-NEXT: shrq $21, %rcx
+; SSE-NEXT: imulq $186738688, %rax, %rcx # imm = 0xB216800
+; SSE-NEXT: shrq $32, %rcx
; SSE-NEXT: leal (%rcx,%rcx,2), %edx
; SSE-NEXT: shll $3, %edx
; SSE-NEXT: subl %edx, %ecx
@@ -505,8 +505,8 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: imulq $91181, %rax, %rcx # imm = 0x1642D
-; AVX-NEXT: shrq $21, %rcx
+; AVX-NEXT: imulq $186738688, %rax, %rcx # imm = 0xB216800
+; AVX-NEXT: shrq $32, %rcx
; AVX-NEXT: leal (%rcx,%rcx,2), %edx
; AVX-NEXT: shll $3, %edx
; AVX-NEXT: subl %edx, %ecx
>From b19aeb0f818f7232db18efc98bcad0ab9949fc86 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Wed, 25 Mar 2026 20:24:41 -0400
Subject: [PATCH 5/6] code review feedback, remove tests from generic
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 6 +-
.../CodeGen/Generic/udiv-narrow-widening.ll | 487 ------------------
llvm/test/CodeGen/RISCV/div-by-constant.ll | 26 +-
.../CodeGen/X86/udiv-const-optimization.ll | 6 +-
4 files changed, 18 insertions(+), 507 deletions(-)
delete mode 100644 llvm/test/CodeGen/Generic/udiv-narrow-widening.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index f2e6ca9f1c154..2f9bce103ff36 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6852,13 +6852,9 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- // Compute known bits once; used both to reduce the Hacker's Delight magic
- // and to check simple-wide-magic conditions below.
- KnownBits Known0 = DAG.computeKnownBits(N0);
-
// Try to use leading zeros of the dividend to reduce the multiplier and
// avoid expensive fixups.
- unsigned KnownLeadingZeros = Known0.countMinLeadingZeros();
+ unsigned KnownLeadingZeros = DAG.computeKnownBits(N0).countMinLeadingZeros();
// If we're after type legalization and SVT is not legal, use the
// promoted type for creating constants to avoid creating nodes with
diff --git a/llvm/test/CodeGen/Generic/udiv-narrow-widening.ll b/llvm/test/CodeGen/Generic/udiv-narrow-widening.ll
deleted file mode 100644
index 124bb7cb73565..0000000000000
--- a/llvm/test/CodeGen/Generic/udiv-narrow-widening.ll
+++ /dev/null
@@ -1,487 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=i386-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X86
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X64
-; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,AARCH64
-; RUN: llc -mtriple=riscv32-unknown-linux-gnu -mattr=+m < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64-unknown-linux-gnu -mattr=+m < %s | FileCheck %s --check-prefixes=CHECK,RV64
-
-; When a narrow udiv-by-constant (i8, i16) is lowered via BuildUDIV, the
-; Granlund-Montgomery (GM) algorithm may need an expensive add-and-shift fixup
-; for IsAdd=true divisors. A wider legal type (e.g. i32 for i16) has enough
-; headroom for a simple magic = ceil(2^Shift / C) that needs no fixup at all.
-;
-; i16 IsAdd=true divisors (e.g. 7): on targets with i32 UMUL_LOHI (i386,
-; x86-64) we emit a shiftless 32x32->64 multiply and take the high 32 bits.
-; On AArch64 (no i32 UMUL_LOHI) we fall back to FullMultiply via i64.
-
-; --- i8 cases ---
-
-define i8 @udiv_i8_by7(i8 %x) nounwind {
-; X86-LABEL: udiv_i8_by7:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,8), %ecx
-; X86-NEXT: leal (%eax,%ecx,4), %ecx
-; X86-NEXT: subb %ch, %al
-; X86-NEXT: shrb %al
-; X86-NEXT: addb %ch, %al
-; X86-NEXT: shrb $2, %al
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: retl
-;
-; X64-LABEL: udiv_i8_by7:
-; X64: # %bb.0:
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: imull $293, %eax, %eax # imm = 0x125
-; X64-NEXT: shrl $11, %eax
-; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: retq
-;
-; AARCH64-LABEL: udiv_i8_by7:
-; AARCH64: // %bb.0:
-; AARCH64-NEXT: // kill: def $w0 killed $w0 def $x0
-; AARCH64-NEXT: mov w8, #293 // =0x125
-; AARCH64-NEXT: and x9, x0, #0xff
-; AARCH64-NEXT: umull x8, w9, w8
-; AARCH64-NEXT: lsr x0, x8, #11
-; AARCH64-NEXT: // kill: def $w0 killed $w0 killed $x0
-; AARCH64-NEXT: ret
-;
-; RV32-LABEL: udiv_i8_by7:
-; RV32: # %bb.0:
-; RV32-NEXT: zext.b a1, a0
-; RV32-NEXT: li a2, 37
-; RV32-NEXT: mul a1, a1, a2
-; RV32-NEXT: srli a1, a1, 8
-; RV32-NEXT: sub a0, a0, a1
-; RV32-NEXT: slli a0, a0, 24
-; RV32-NEXT: srli a0, a0, 25
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: srli a0, a0, 2
-; RV32-NEXT: ret
-;
-; RV64-LABEL: udiv_i8_by7:
-; RV64: # %bb.0:
-; RV64-NEXT: zext.b a0, a0
-; RV64-NEXT: li a1, 293
-; RV64-NEXT: mul a0, a0, a1
-; RV64-NEXT: srli a0, a0, 11
-; RV64-NEXT: ret
- %d = udiv i8 %x, 7
- ret i8 %d
-}
-
-define i8 @udiv_i8_by5(i8 %x) nounwind {
-; X86-LABEL: udiv_i8_by5:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull $205, %eax, %eax
-; X86-NEXT: shrl $10, %eax
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: retl
-;
-; X64-LABEL: udiv_i8_by5:
-; X64: # %bb.0:
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: imull $205, %eax, %eax
-; X64-NEXT: shrl $10, %eax
-; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: retq
-;
-; AARCH64-LABEL: udiv_i8_by5:
-; AARCH64: // %bb.0:
-; AARCH64-NEXT: mov w8, #205 // =0xcd
-; AARCH64-NEXT: and w9, w0, #0xff
-; AARCH64-NEXT: mul w8, w9, w8
-; AARCH64-NEXT: lsr w0, w8, #10
-; AARCH64-NEXT: ret
-;
-; RV32-LABEL: udiv_i8_by5:
-; RV32: # %bb.0:
-; RV32-NEXT: zext.b a0, a0
-; RV32-NEXT: li a1, 205
-; RV32-NEXT: mul a0, a0, a1
-; RV32-NEXT: srli a0, a0, 10
-; RV32-NEXT: ret
-;
-; RV64-LABEL: udiv_i8_by5:
-; RV64: # %bb.0:
-; RV64-NEXT: zext.b a0, a0
-; RV64-NEXT: li a1, 205
-; RV64-NEXT: mul a0, a0, a1
-; RV64-NEXT: srli a0, a0, 10
-; RV64-NEXT: ret
- %d = udiv i8 %x, 5
- ret i8 %d
-}
-
-define i8 @udiv_i8_by3(i8 %x) nounwind {
-; X86-LABEL: udiv_i8_by3:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull $171, %eax, %eax
-; X86-NEXT: shrl $9, %eax
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: retl
-;
-; X64-LABEL: udiv_i8_by3:
-; X64: # %bb.0:
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: imull $171, %eax, %eax
-; X64-NEXT: shrl $9, %eax
-; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: retq
-;
-; AARCH64-LABEL: udiv_i8_by3:
-; AARCH64: // %bb.0:
-; AARCH64-NEXT: mov w8, #171 // =0xab
-; AARCH64-NEXT: and w9, w0, #0xff
-; AARCH64-NEXT: mul w8, w9, w8
-; AARCH64-NEXT: lsr w0, w8, #9
-; AARCH64-NEXT: ret
-;
-; RV32-LABEL: udiv_i8_by3:
-; RV32: # %bb.0:
-; RV32-NEXT: zext.b a0, a0
-; RV32-NEXT: li a1, 171
-; RV32-NEXT: mul a0, a0, a1
-; RV32-NEXT: srli a0, a0, 9
-; RV32-NEXT: ret
-;
-; RV64-LABEL: udiv_i8_by3:
-; RV64: # %bb.0:
-; RV64-NEXT: zext.b a0, a0
-; RV64-NEXT: li a1, 171
-; RV64-NEXT: mul a0, a0, a1
-; RV64-NEXT: srli a0, a0, 9
-; RV64-NEXT: ret
- %d = udiv i8 %x, 3
- ret i8 %d
-}
-
-; Even divisor - stock uses pre-shift + magic; simple magic avoids pre-shift.
-define i8 @udiv_i8_by78(i8 %x) nounwind {
-; X86-LABEL: udiv_i8_by78:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrb %al
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: imull $211, %eax, %eax
-; X86-NEXT: shrl $13, %eax
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: retl
-;
-; X64-LABEL: udiv_i8_by78:
-; X64: # %bb.0:
-; X64-NEXT: shrb %dil
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: imull $211, %eax, %eax
-; X64-NEXT: shrl $13, %eax
-; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: retq
-;
-; AARCH64-LABEL: udiv_i8_by78:
-; AARCH64: // %bb.0:
-; AARCH64-NEXT: ubfx w8, w0, #1, #7
-; AARCH64-NEXT: mov w9, #211 // =0xd3
-; AARCH64-NEXT: mul w8, w8, w9
-; AARCH64-NEXT: lsr w0, w8, #13
-; AARCH64-NEXT: ret
-;
-; RV32-LABEL: udiv_i8_by78:
-; RV32: # %bb.0:
-; RV32-NEXT: slli a0, a0, 24
-; RV32-NEXT: srli a0, a0, 25
-; RV32-NEXT: li a1, 211
-; RV32-NEXT: mul a0, a0, a1
-; RV32-NEXT: srli a0, a0, 13
-; RV32-NEXT: ret
-;
-; RV64-LABEL: udiv_i8_by78:
-; RV64: # %bb.0:
-; RV64-NEXT: slli a0, a0, 56
-; RV64-NEXT: srli a0, a0, 57
-; RV64-NEXT: li a1, 211
-; RV64-NEXT: mul a0, a0, a1
-; RV64-NEXT: srli a0, a0, 13
-; RV64-NEXT: ret
- %d = udiv i8 %x, 78
- ret i8 %d
-}
-
-define i8 @udiv_i8_by116(i8 %x) nounwind {
-; X86-LABEL: udiv_i8_by116:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrb $2, %al
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: imull $71, %eax, %eax
-; X86-NEXT: shrl $11, %eax
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: retl
-;
-; X64-LABEL: udiv_i8_by116:
-; X64: # %bb.0:
-; X64-NEXT: shrb $2, %dil
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: imull $71, %eax, %eax
-; X64-NEXT: shrl $11, %eax
-; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: retq
-;
-; AARCH64-LABEL: udiv_i8_by116:
-; AARCH64: // %bb.0:
-; AARCH64-NEXT: ubfx w8, w0, #2, #6
-; AARCH64-NEXT: mov w9, #71 // =0x47
-; AARCH64-NEXT: mul w8, w8, w9
-; AARCH64-NEXT: lsr w0, w8, #11
-; AARCH64-NEXT: ret
-;
-; RV32-LABEL: udiv_i8_by116:
-; RV32: # %bb.0:
-; RV32-NEXT: slli a0, a0, 24
-; RV32-NEXT: srli a0, a0, 26
-; RV32-NEXT: li a1, 71
-; RV32-NEXT: mul a0, a0, a1
-; RV32-NEXT: srli a0, a0, 11
-; RV32-NEXT: ret
-;
-; RV64-LABEL: udiv_i8_by116:
-; RV64: # %bb.0:
-; RV64-NEXT: slli a0, a0, 56
-; RV64-NEXT: srli a0, a0, 58
-; RV64-NEXT: li a1, 71
-; RV64-NEXT: mul a0, a0, a1
-; RV64-NEXT: srli a0, a0, 11
-; RV64-NEXT: ret
- %d = udiv i8 %x, 116
- ret i8 %d
-}
-
-; Power of 2 - should NOT fire (already lowered to shift).
-define i8 @udiv_i8_by4(i8 %x) nounwind {
-; X86-LABEL: udiv_i8_by4:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrb $2, %al
-; X86-NEXT: retl
-;
-; X64-LABEL: udiv_i8_by4:
-; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrb $2, %al
-; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: retq
-;
-; AARCH64-LABEL: udiv_i8_by4:
-; AARCH64: // %bb.0:
-; AARCH64-NEXT: ubfx w0, w0, #2, #6
-; AARCH64-NEXT: ret
-;
-; RV32-LABEL: udiv_i8_by4:
-; RV32: # %bb.0:
-; RV32-NEXT: slli a0, a0, 24
-; RV32-NEXT: srli a0, a0, 26
-; RV32-NEXT: ret
-;
-; RV64-LABEL: udiv_i8_by4:
-; RV64: # %bb.0:
-; RV64-NEXT: slli a0, a0, 56
-; RV64-NEXT: srli a0, a0, 58
-; RV64-NEXT: ret
- %d = udiv i8 %x, 4
- ret i8 %d
-}
-
-; Division by 1 - should NOT fire.
-define i8 @udiv_i8_by1(i8 %x) nounwind {
-; X86-LABEL: udiv_i8_by1:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: retl
-;
-; X64-LABEL: udiv_i8_by1:
-; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: retq
-;
-; AARCH64-LABEL: udiv_i8_by1:
-; AARCH64: // %bb.0:
-; AARCH64-NEXT: ret
-;
-; RV32-LABEL: udiv_i8_by1:
-; RV32: # %bb.0:
-; RV32-NEXT: ret
-;
-; RV64-LABEL: udiv_i8_by1:
-; RV64: # %bb.0:
-; RV64-NEXT: ret
- %d = udiv i8 %x, 1
- ret i8 %d
-}
-
-; Bare i8 udiv feeding another i8 op (no zext).
-define i8 @udiv_i8_then_add(i8 %x, i8 %y) nounwind {
-; X86-LABEL: udiv_i8_then_add:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,8), %ecx
-; X86-NEXT: leal (%eax,%ecx,4), %ecx
-; X86-NEXT: subb %ch, %al
-; X86-NEXT: shrb %al
-; X86-NEXT: addb %ch, %al
-; X86-NEXT: shrb $2, %al
-; X86-NEXT: addb {{[0-9]+}}(%esp), %al
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: retl
-;
-; X64-LABEL: udiv_i8_then_add:
-; X64: # %bb.0:
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: imull $293, %eax, %eax # imm = 0x125
-; X64-NEXT: shrl $11, %eax
-; X64-NEXT: addb %sil, %al
-; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: retq
-;
-; AARCH64-LABEL: udiv_i8_then_add:
-; AARCH64: // %bb.0:
-; AARCH64-NEXT: mov w8, #293 // =0x125
-; AARCH64-NEXT: and w9, w0, #0xff
-; AARCH64-NEXT: mul w8, w9, w8
-; AARCH64-NEXT: add w0, w1, w8, lsr #11
-; AARCH64-NEXT: ret
-;
-; RV32-LABEL: udiv_i8_then_add:
-; RV32: # %bb.0:
-; RV32-NEXT: zext.b a2, a0
-; RV32-NEXT: li a3, 37
-; RV32-NEXT: mul a2, a2, a3
-; RV32-NEXT: srli a2, a2, 8
-; RV32-NEXT: sub a0, a0, a2
-; RV32-NEXT: slli a0, a0, 24
-; RV32-NEXT: srli a0, a0, 25
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: srli a0, a0, 2
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: ret
-;
-; RV64-LABEL: udiv_i8_then_add:
-; RV64: # %bb.0:
-; RV64-NEXT: zext.b a0, a0
-; RV64-NEXT: li a2, 293
-; RV64-NEXT: mul a0, a0, a2
-; RV64-NEXT: srli a0, a0, 11
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: ret
- %d = udiv i8 %x, 7
- %r = add i8 %d, %y
- ret i8 %r
-}
-
-; --- i16 cases ---
-
-; IsAdd=true: i386 uses shiftless mull (UMUL_LOHI), x86-64 uses imulq+shrq $32,
-; AArch64 falls back to umull+lsr (FullMultiply via i64).
-define i16 @udiv_i16_by7(i16 %x) nounwind {
-; X86-LABEL: udiv_i16_by7:
-; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl $613572608, %ecx # imm = 0x24926000
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-NEXT: retl
-;
-; X64-LABEL: udiv_i16_by7:
-; X64: # %bb.0:
-; X64-NEXT: movzwl %di, %eax
-; X64-NEXT: imulq $613572608, %rax, %rax # imm = 0x24926000
-; X64-NEXT: shrq $32, %rax
-; X64-NEXT: # kill: def $ax killed $ax killed $rax
-; X64-NEXT: retq
-;
-; AARCH64-LABEL: udiv_i16_by7:
-; AARCH64: // %bb.0:
-; AARCH64-NEXT: mov w8, #9363 // =0x2493
-; AARCH64-NEXT: // kill: def $w0 killed $w0 def $x0
-; AARCH64-NEXT: and x9, x0, #0xffff
-; AARCH64-NEXT: movk w8, #1, lsl #16
-; AARCH64-NEXT: umull x8, w9, w8
-; AARCH64-NEXT: lsr x0, x8, #19
-; AARCH64-NEXT: // kill: def $w0 killed $w0 killed $x0
-; AARCH64-NEXT: ret
-;
-; RV32-LABEL: udiv_i16_by7:
-; RV32: # %bb.0:
-; RV32-NEXT: slli a0, a0, 16
-; RV32-NEXT: srli a0, a0, 16
-; RV32-NEXT: lui a1, 149798
-; RV32-NEXT: mulhu a0, a0, a1
-; RV32-NEXT: ret
-;
-; RV64-LABEL: udiv_i16_by7:
-; RV64: # %bb.0:
-; RV64-NEXT: lui a1, 74899
-; RV64-NEXT: slli a1, a1, 4
-; RV64-NEXT: slli a0, a0, 48
-; RV64-NEXT: mulhu a0, a0, a1
-; RV64-NEXT: srli a0, a0, 19
-; RV64-NEXT: ret
- %d = udiv i16 %x, 7
- ret i16 %d
-}
-
-define i16 @udiv_i16_by100(i16 %x) nounwind {
-; X86-LABEL: udiv_i16_by100:
-; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $2, %eax
-; X86-NEXT: imull $5243, %eax, %eax # imm = 0x147B
-; X86-NEXT: shrl $17, %eax
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-NEXT: retl
-;
-; X64-LABEL: udiv_i16_by100:
-; X64: # %bb.0:
-; X64-NEXT: movzwl %di, %eax
-; X64-NEXT: shrl $2, %eax
-; X64-NEXT: imull $5243, %eax, %eax # imm = 0x147B
-; X64-NEXT: shrl $17, %eax
-; X64-NEXT: # kill: def $ax killed $ax killed $eax
-; X64-NEXT: retq
-;
-; AARCH64-LABEL: udiv_i16_by100:
-; AARCH64: // %bb.0:
-; AARCH64-NEXT: ubfx w8, w0, #2, #14
-; AARCH64-NEXT: mov w9, #5243 // =0x147b
-; AARCH64-NEXT: mul w8, w8, w9
-; AARCH64-NEXT: lsr w0, w8, #17
-; AARCH64-NEXT: ret
-;
-; RV32-LABEL: udiv_i16_by100:
-; RV32: # %bb.0:
-; RV32-NEXT: slli a0, a0, 16
-; RV32-NEXT: lui a1, 1
-; RV32-NEXT: srli a0, a0, 18
-; RV32-NEXT: addi a1, a1, 1147
-; RV32-NEXT: mul a0, a0, a1
-; RV32-NEXT: srli a0, a0, 17
-; RV32-NEXT: ret
-;
-; RV64-LABEL: udiv_i16_by100:
-; RV64: # %bb.0:
-; RV64-NEXT: slli a0, a0, 48
-; RV64-NEXT: lui a1, 1
-; RV64-NEXT: srli a0, a0, 50
-; RV64-NEXT: addi a1, a1, 1147
-; RV64-NEXT: mul a0, a0, a1
-; RV64-NEXT: srli a0, a0, 17
-; RV64-NEXT: ret
- %d = udiv i16 %x, 100
- ret i16 %d
-}
-
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index 40b599bc1a076..44773ff04924b 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -278,18 +278,20 @@ define i16 @udiv16_constant_no_add(i16 %a) nounwind {
}
define i16 @udiv16_constant_add(i16 %a) nounwind {
-; RV32-LABEL: udiv16_constant_add:
-; RV32: # %bb.0:
-; RV32-NEXT: slli a1, a0, 16
-; RV32-NEXT: lui a2, 149808
-; RV32-NEXT: mulhu a1, a1, a2
-; RV32-NEXT: srli a1, a1, 16
-; RV32-NEXT: sub a0, a0, a1
-; RV32-NEXT: slli a0, a0, 16
-; RV32-NEXT: srli a0, a0, 17
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: srli a0, a0, 2
-; RV32-NEXT: ret
+; RV32IM-LABEL: udiv16_constant_add:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: slli a0, a0, 16
+; RV32IM-NEXT: srli a0, a0, 16
+; RV32IM-NEXT: lui a1, 149798
+; RV32IM-NEXT: mulhu a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: udiv16_constant_add:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: zext.h a0, a0
+; RV32IMZB-NEXT: lui a1, 149798
+; RV32IMZB-NEXT: mulhu a0, a0, a1
+; RV32IMZB-NEXT: ret
;
; RV64-LABEL: udiv16_constant_add:
; RV64: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/udiv-const-optimization.ll b/llvm/test/CodeGen/X86/udiv-const-optimization.ll
index 5e2518108512a..b4152aaddf39d 100644
--- a/llvm/test/CodeGen/X86/udiv-const-optimization.ll
+++ b/llvm/test/CodeGen/X86/udiv-const-optimization.ll
@@ -62,7 +62,7 @@ define i32 @udiv_by_19(i32 %x) nounwind {
; X86-LABEL: udiv_by_19:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl ${{-?[0-9]+}}, %edx # imm = 0xAF286BCB
+; X86-NEXT: movl $-1356305461, %edx # imm = 0xAF286BCB
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
; X86-NEXT: subl %edx, %ecx
@@ -96,7 +96,7 @@ define i32 @udiv_by_21(i32 %x) nounwind {
; X86-LABEL: udiv_by_21:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl ${{-?[0-9]+}}, %edx # imm = 0x86186187
+; X86-NEXT: movl $-2045222521, %edx # imm = 0x86186187
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
; X86-NEXT: subl %edx, %ecx
@@ -131,7 +131,7 @@ define i32 @udiv_by_3(i32 %x) nounwind {
;
; X86-LABEL: udiv_by_3:
; X86: # %bb.0:
-; X86-NEXT: movl ${{-?[0-9]+}}, %eax # imm = 0xAAAAAAAB
+; X86-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %eax
; X86-NEXT: shrl %eax
>From 8a953862ab10de99f4abf31a7b245d3be7faf463 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sat, 4 Apr 2026 22:19:06 -0400
Subject: [PATCH 6/6] fix botched rebase
---
llvm/test/CodeGen/X86/urem-vector-lkk.ll | 298 +----------------------
1 file changed, 2 insertions(+), 296 deletions(-)
diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
index 89cef0daffd77..3faa2a0720d4e 100644
--- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
@@ -22,78 +22,8 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,124,98,1003,u,u,u,u]
; SSE2-NEXT: psubw %xmm1, %xmm0
; SSE2-NEXT: retq
-; SSE-LABEL: fold_urem_vec_1:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: shrl $2, %ecx
-; SSE-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
-; SSE-NEXT: shrl $19, %ecx
-; SSE-NEXT: imull $124, %ecx, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: movd %xmm0, %ecx
-; SSE-NEXT: movzwl %cx, %edx
-; SSE-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
-; SSE-NEXT: shrl $22, %edx
-; SSE-NEXT: imull $95, %edx, %edx
-; SSE-NEXT: subl %edx, %ecx
-; SSE-NEXT: movd %ecx, %xmm1
-; SSE-NEXT: pinsrw $1, %eax, %xmm1
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: shrl %ecx
-; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
-; SSE-NEXT: shrl $17, %ecx
-; SSE-NEXT: imull $98, %ecx, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $2, %eax, %xmm1
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: imull $1373, %eax, %ecx # imm = 0x55D
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: movl %eax, %edx
-; SSE-NEXT: subl %ecx, %edx
-; SSE-NEXT: movzwl %dx, %edx
-; SSE-NEXT: shrl %edx
-; SSE-NEXT: addl %ecx, %edx
-; SSE-NEXT: shrl $9, %edx
-; SSE-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $3, %eax, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
-; SSE-LABEL: fold_urem_vec_1:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: shrl $2, %ecx
-; SSE-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
-; SSE-NEXT: shrl $19, %ecx
-; SSE-NEXT: imull $124, %ecx, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: movd %xmm0, %ecx
-; SSE-NEXT: movzwl %cx, %edx
-; SSE-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
-; SSE-NEXT: shrl $22, %edx
-; SSE-NEXT: imull $95, %edx, %edx
-; SSE-NEXT: subl %edx, %ecx
-; SSE-NEXT: movd %ecx, %xmm1
-; SSE-NEXT: pinsrw $1, %eax, %xmm1
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: shrl %ecx
-; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
-; SSE-NEXT: shrl $17, %ecx
-; SSE-NEXT: imull $98, %ecx, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $2, %eax, %xmm1
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: imulq $4282176, %rax, %rcx # imm = 0x415740
-; SSE-NEXT: shrq $32, %rcx
-; SSE-NEXT: imull $1003, %ecx, %ecx # imm = 0x3EB
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $3, %eax, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
+;
+; SSE4-LABEL: fold_urem_vec_1:
; SSE4: # %bb.0:
; SSE4-NEXT: movq {{.*#+}} xmm1 = [0,16384,32768,0,0,0,0,0]
; SSE4-NEXT: pmulhuw %xmm0, %xmm1
@@ -132,79 +62,6 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,124,98,1003,u,u,u,u]
; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
-||||||| parent of 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
-; AVX-LABEL: fold_urem_vec_1:
-; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $2, %ecx
-; AVX-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
-; AVX-NEXT: shrl $19, %ecx
-; AVX-NEXT: imull $124, %ecx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vmovd %xmm0, %ecx
-; AVX-NEXT: movzwl %cx, %edx
-; AVX-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
-; AVX-NEXT: shrl $22, %edx
-; AVX-NEXT: imull $95, %edx, %edx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl %ecx
-; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
-; AVX-NEXT: shrl $17, %ecx
-; AVX-NEXT: imull $98, %ecx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: imull $1373, %eax, %ecx # imm = 0x55D
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $9, %edx
-; AVX-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
-; AVX-NEXT: retq
-=======
-; AVX-LABEL: fold_urem_vec_1:
-; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $2, %ecx
-; AVX-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
-; AVX-NEXT: shrl $19, %ecx
-; AVX-NEXT: imull $124, %ecx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vmovd %xmm0, %ecx
-; AVX-NEXT: movzwl %cx, %edx
-; AVX-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
-; AVX-NEXT: shrl $22, %edx
-; AVX-NEXT: imull $95, %edx, %edx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl %ecx
-; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
-; AVX-NEXT: shrl $17, %ecx
-; AVX-NEXT: imull $98, %ecx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: imulq $4282176, %rax, %rcx # imm = 0x415740
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: imull $1003, %ecx, %ecx # imm = 0x3EB
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
-; AVX-NEXT: retq
->>>>>>> 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
%1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
ret <4 x i16> %1
}
@@ -318,7 +175,6 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; Don't fold if the divisor is one.
define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
-<<<<<<< HEAD
; SSE2-LABEL: dont_fold_urem_one:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
@@ -348,67 +204,7 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,654,23,5423,u,u,u,u]
; SSE4-NEXT: psubw %xmm2, %xmm0
; SSE4-NEXT: retq
-||||||| parent of 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
-; SSE-LABEL: dont_fold_urem_one:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: movl %eax, %edx
-; SSE-NEXT: subl %ecx, %edx
-; SSE-NEXT: movzwl %dx, %edx
-; SSE-NEXT: shrl %edx
-; SSE-NEXT: addl %ecx, %edx
-; SSE-NEXT: shrl $4, %edx
-; SSE-NEXT: leal (%rdx,%rdx,2), %ecx
-; SSE-NEXT: shll $3, %ecx
-; SSE-NEXT: subl %ecx, %edx
-; SSE-NEXT: addl %eax, %edx
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
-; SSE-NEXT: shrl $25, %ecx
-; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pinsrw $1, %eax, %xmm1
-; SSE-NEXT: pinsrw $2, %edx, %xmm1
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
-; SSE-NEXT: shrl $26, %ecx
-; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $3, %eax, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
-=======
-; SSE-LABEL: dont_fold_urem_one:
-; SSE: # %bb.0:
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
-; SSE-NEXT: shrl $25, %ecx
-; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pinsrw $1, %eax, %xmm1
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: imulq $186738688, %rax, %rcx # imm = 0xB216800
-; SSE-NEXT: shrq $32, %rcx
-; SSE-NEXT: leal (%rcx,%rcx,2), %edx
-; SSE-NEXT: shll $3, %edx
-; SSE-NEXT: subl %edx, %ecx
-; SSE-NEXT: addl %eax, %ecx
-; SSE-NEXT: pinsrw $2, %ecx, %xmm1
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
-; SSE-NEXT: shrl $26, %ecx
-; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $3, %eax, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
->>>>>>> 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
;
-<<<<<<< HEAD
; AVX1OR2-LABEL: dont_fold_urem_one:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,51307,25645,12375,u,u,u,u]
@@ -432,94 +228,6 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,654,23,5423,u,u,u,u]
; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
-||||||| parent of 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
-; AVX1OR2-LABEL: dont_fold_urem_one:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpextrw $2, %xmm0, %eax
-; AVX1OR2-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
-; AVX1OR2-NEXT: shrl $16, %ecx
-; AVX1OR2-NEXT: movl %eax, %edx
-; AVX1OR2-NEXT: subl %ecx, %edx
-; AVX1OR2-NEXT: movzwl %dx, %edx
-; AVX1OR2-NEXT: shrl %edx
-; AVX1OR2-NEXT: addl %ecx, %edx
-; AVX1OR2-NEXT: shrl $4, %edx
-; AVX1OR2-NEXT: leal (%rdx,%rdx,2), %ecx
-; AVX1OR2-NEXT: shll $3, %ecx
-; AVX1OR2-NEXT: subl %ecx, %edx
-; AVX1OR2-NEXT: addl %eax, %edx
-; AVX1OR2-NEXT: vpextrw $1, %xmm0, %eax
-; AVX1OR2-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
-; AVX1OR2-NEXT: shrl $25, %ecx
-; AVX1OR2-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
-; AVX1OR2-NEXT: subl %ecx, %eax
-; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpextrw $3, %xmm0, %eax
-; AVX1OR2-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
-; AVX1OR2-NEXT: shrl $26, %ecx
-; AVX1OR2-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
-; AVX1OR2-NEXT: subl %ecx, %eax
-; AVX1OR2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
-; AVX1OR2-NEXT: retq
-;
-; AVX512-LABEL: dont_fold_urem_one:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpextrw $2, %xmm0, %eax
-; AVX512-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
-; AVX512-NEXT: shrl $16, %ecx
-; AVX512-NEXT: movl %eax, %edx
-; AVX512-NEXT: subl %ecx, %edx
-; AVX512-NEXT: movzwl %dx, %edx
-; AVX512-NEXT: shrl %edx
-; AVX512-NEXT: addl %ecx, %edx
-; AVX512-NEXT: shrl $4, %edx
-; AVX512-NEXT: leal (%rdx,%rdx,2), %ecx
-; AVX512-NEXT: shll $3, %ecx
-; AVX512-NEXT: subl %ecx, %edx
-; AVX512-NEXT: vpextrw $1, %xmm0, %ecx
-; AVX512-NEXT: addl %eax, %edx
-; AVX512-NEXT: imull $51307, %ecx, %eax # imm = 0xC86B
-; AVX512-NEXT: shrl $25, %eax
-; AVX512-NEXT: imull $654, %eax, %eax # imm = 0x28E
-; AVX512-NEXT: subl %eax, %ecx
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
-; AVX512-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
-; AVX512-NEXT: vpextrw $3, %xmm0, %eax
-; AVX512-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
-; AVX512-NEXT: shrl $26, %ecx
-; AVX512-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
-; AVX512-NEXT: subl %ecx, %eax
-; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
-; AVX512-NEXT: retq
-=======
-; AVX-LABEL: dont_fold_urem_one:
-; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
-; AVX-NEXT: shrl $25, %ecx
-; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: imulq $186738688, %rax, %rcx # imm = 0xB216800
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: leal (%rcx,%rcx,2), %edx
-; AVX-NEXT: shll $3, %edx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
-; AVX-NEXT: shrl $26, %ecx
-; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
-; AVX-NEXT: retq
->>>>>>> 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
%1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
ret <4 x i16> %1
}
@@ -737,5 +445,3 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
%1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
ret <4 x i64> %1
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX1OR2: {{.*}}
More information about the llvm-commits
mailing list