[llvm] [SelectionDAG] constant division fallback for existing Constant Division optimization (PR #188402)

Sat Apr 4 19:19:21 PDT 2026

https://github.com/Takashiidobe updated https://github.com/llvm/llvm-project/pull/188402

>From b4a6a891a45f341fe0053456b93829d12c4abd37 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Mon, 23 Mar 2026 20:07:36 -0400
Subject: [PATCH 1/6] add pre-commit tests for narrow udiv magic multiply

Add tests showing the current codegen for i8 and i16 udiv-by-constant.
The Hacker's Delight algorithm often needs an expensive add-and-shift
fixup for these narrow types. A subsequent patch will improve BuildUDIV
to use a simple magic multiply at a wider legal type instead.
---
 llvm/test/CodeGen/X86/udiv-narrow-magic.ll | 170 +++++++++++++++++++++
 1 file changed, 170 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/udiv-narrow-magic.ll

diff --git a/llvm/test/CodeGen/X86/udiv-narrow-magic.ll b/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
new file mode 100644
index 0000000000000..5aebf694da0ff
--- /dev/null
+++ b/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=x86_64 < %s | FileCheck %s
+
+; When a narrow udiv-by-constant (i8, i16) is lowered via BuildUDIV, the
+; Hacker's Delight algorithm may need an expensive add-and-shift fixup.
+; A wider legal type (e.g. i32 for i8) has enough headroom for a simple
+; Magic = ceil(2^Shift / C) that needs no fixup at all.
+
+; --- i8 cases ---
+
+define i8 @udiv_i8_by7(i8 %x) nounwind {
+; CHECK-LABEL: udiv_i8_by7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    leal (%rax,%rax,8), %ecx
+; CHECK-NEXT:    leal (%rax,%rcx,4), %ecx
+; CHECK-NEXT:    shrl $8, %ecx
+; CHECK-NEXT:    subb %cl, %al
+; CHECK-NEXT:    shrb %al
+; CHECK-NEXT:    addb %cl, %al
+; CHECK-NEXT:    shrb $2, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $rax
+; CHECK-NEXT:    retq
+  %d = udiv i8 %x, 7
+  ret i8 %d
+}
+
+define i8 @udiv_i8_by5(i8 %x) nounwind {
+; CHECK-LABEL: udiv_i8_by5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    imull $205, %eax, %eax
+; CHECK-NEXT:    shrl $10, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+  %d = udiv i8 %x, 5
+  ret i8 %d
+}
+
+define i8 @udiv_i8_by3(i8 %x) nounwind {
+; CHECK-LABEL: udiv_i8_by3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    imull $171, %eax, %eax
+; CHECK-NEXT:    shrl $9, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+  %d = udiv i8 %x, 3
+  ret i8 %d
+}
+
+; Even divisor - stock uses pre-shift + magic; simple magic avoids pre-shift.
+define i8 @udiv_i8_by78(i8 %x) nounwind {
+; CHECK-LABEL: udiv_i8_by78:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    shrb %dil
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    imull $211, %eax, %eax
+; CHECK-NEXT:    shrl $13, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+  %d = udiv i8 %x, 78
+  ret i8 %d
+}
+
+define i8 @udiv_i8_by116(i8 %x) nounwind {
+; CHECK-LABEL: udiv_i8_by116:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    shrb $2, %dil
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    imull $71, %eax, %eax
+; CHECK-NEXT:    shrl $11, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+  %d = udiv i8 %x, 116
+  ret i8 %d
+}
+
+; Power of 2 - should NOT fire (already lowered to shift).
+define i8 @udiv_i8_by4(i8 %x) nounwind {
+; CHECK-LABEL: udiv_i8_by4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shrb $2, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+  %d = udiv i8 %x, 4
+  ret i8 %d
+}
+
+; Division by 1 - should NOT fire.
+define i8 @udiv_i8_by1(i8 %x) nounwind {
+; CHECK-LABEL: udiv_i8_by1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+  %d = udiv i8 %x, 1
+  ret i8 %d
+}
+
+; Bare i8 udiv feeding another i8 op (no zext).
+define i8 @udiv_i8_then_add(i8 %x, i8 %y) nounwind {
+; CHECK-LABEL: udiv_i8_then_add:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    leal (%rax,%rax,8), %ecx
+; CHECK-NEXT:    leal (%rax,%rcx,4), %ecx
+; CHECK-NEXT:    shrl $8, %ecx
+; CHECK-NEXT:    subb %cl, %al
+; CHECK-NEXT:    shrb %al
+; CHECK-NEXT:    addb %cl, %al
+; CHECK-NEXT:    shrb $2, %al
+; CHECK-NEXT:    addb %sil, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $rax
+; CHECK-NEXT:    retq
+  %d = udiv i8 %x, 7
+  %r = add i8 %d, %y
+  ret i8 %r
+}
+
+; --- i16 cases ---
+
+define i16 @udiv_i16_by7(i16 %x) nounwind {
+; CHECK-LABEL: udiv_i16_by7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzwl %di, %eax
+; CHECK-NEXT:    imull $9363, %eax, %ecx # imm = 0x2493
+; CHECK-NEXT:    shrl $16, %ecx
+; CHECK-NEXT:    subl %ecx, %edi
+; CHECK-NEXT:    movzwl %di, %eax
+; CHECK-NEXT:    shrl %eax
+; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    shrl $2, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    retq
+  %d = udiv i16 %x, 7
+  ret i16 %d
+}
+
+define i16 @udiv_i16_by100(i16 %x) nounwind {
+; CHECK-LABEL: udiv_i16_by100:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzwl %di, %eax
+; CHECK-NEXT:    shrl $2, %eax
+; CHECK-NEXT:    imull $5243, %eax, %eax # imm = 0x147B
+; CHECK-NEXT:    shrl $17, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    retq
+  %d = udiv i16 %x, 100
+  ret i16 %d
+}
+
+; zext(udiv i16) - should also improve.
+define i32 @zext_udiv_i16_by7(i16 %x) nounwind {
+; CHECK-LABEL: zext_udiv_i16_by7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzwl %di, %eax
+; CHECK-NEXT:    imull $9363, %eax, %ecx # imm = 0x2493
+; CHECK-NEXT:    shrl $16, %ecx
+; CHECK-NEXT:    subl %ecx, %edi
+; CHECK-NEXT:    movzwl %di, %eax
+; CHECK-NEXT:    shrl %eax
+; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    shrl $2, %eax
+; CHECK-NEXT:    retq
+  %d = udiv i16 %x, 7
+  %z = zext i16 %d to i32
+  ret i32 %z
+}

>From 5b9bb3086de6024f404eaec034f230db5fb9c280 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Tue, 24 Mar 2026 17:36:01 -0400
Subject: [PATCH 2/6] Use fixup-free 64-bit magic multiply for narrow udiv with
 IsAdd

For i8/i16 udiv by constant where Hacker's Delight requires an
add-and-shift fixup (IsAdd), try a fixup-free alternative:

  trunc(srl(mul(zext(x, 64), ceil(2^Shift / C)), Shift))

This is strictly cheaper than the HD NPQ path (zext + MUL + SRL vs.
MULHU + sub + srl + add + srl). The optimization is gated on i64 MUL
being natively legal, since some divisors near powers of two require
widening to 64 bits and a synthesized 64-bit multiply would be worse
than the fixup. When no fixup-free solution exists, the HD path is used.

Implementation notes:
- findSimpleWideMagic searches 64-bit space only: when HD requires IsAdd
  for i8/i16, no 32-bit fixup-free solution exists (brute-force verified)
- Single i64 legality check at call site gates the entire optimization
- Magic*C overflow check removed: Magic*C <= 2^63 + 65535 < 2^64 for
  i8/i16, so it never overflows
- Break on Check (1) overflow is correct: Magic = ceil(2^Shift / C) is
  non-decreasing in Shift, so once MaxX * Magic overflows 64 bits it
  also overflows for every larger Shift
- Return type uses a named struct + bool following LLVM convention
- UseSimpleWideMul flag replaced by SimpleWideMulMagic.getNode() sentinel
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 101 +++++-
 llvm/test/CodeGen/X86/udiv-narrow-magic.ll    |  43 +--
 llvm/test/CodeGen/X86/urem-vector-lkk.ll      | 298 +++++++++++++++++-
 3 files changed, 408 insertions(+), 34 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 092bc283c84dc..28f962c93b981 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6781,6 +6781,56 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::ADD, dl, VT, Q, T);
 }
 
+/// For narrow scalar types (i8/i16) where Hacker's Delight requires an
+/// add-and-shift fixup (IsAdd), check whether a fixup-free 64-bit magic
+/// multiply exists:
+///   trunc(srl(mul(zext(x, 64), Magic), Shift))
+/// where Magic = ceil(2^Shift / C).
+///
+/// No fixup is needed when two conditions hold:
+///   (1) MaxX * Magic < 2^64    (multiply doesn't overflow 64 bits)
+///   (2) MaxX * (Magic*C - 2^Shift) < 2^Shift  (approximation error is exact)
+///
+/// When IsAdd is required by HD, no 32-bit fixup-free solution exists, so we
+/// search only in 64-bit space. Populates Info and returns true on success.
+struct SimpleWideMagicInfo {
+  APInt Magic;
+  unsigned Shift;
+};
+
+static bool findSimpleWideMagic(const APInt &Divisor, const APInt &MaxX,
+                                SimpleWideMagicInfo &Info) {
+  APInt DivWide = Divisor.zext(64);
+  APInt MaxWide = MaxX.zext(64);
+  unsigned MinShift = Divisor.ceilLogBase2();
+
+  for (unsigned Shift = MinShift; Shift < 64; ++Shift) {
+    APInt TwoToS = APInt(64, 1).shl(Shift);
+    APInt Magic = APIntOps::RoundingUDiv(TwoToS, DivWide, APInt::Rounding::UP);
+
+    // Check (1): MaxX * Magic must fit in 64 bits. Magic = ceil(2^Shift / C)
+    // grows monotonically with Shift, so once this overflows no larger Shift
+    // can succeed either.
+    bool Overflow = false;
+    (void)MaxWide.umul_ov(Magic, Overflow);
+    if (Overflow)
+      break;
+
+    // Check (2): MaxX * (Magic*C - 2^Shift) < 2^Shift.
+    // Magic*C never overflows 64 bits for i8/i16: Magic*C <= 2^Shift + C
+    // <= 2^63 + 65535 < 2^64.
+    APInt Error = Magic * DivWide - TwoToS;
+    APInt MaxError = MaxWide.umul_ov(Error, Overflow);
+    if (Overflow || MaxError.uge(TwoToS))
+      continue;
+
+    Info = {Magic, Shift};
+    return true;
+  }
+
+  return false;
+}
+
 /// Given an ISD::UDIV node expressing a divide by constant,
 /// return a DAG expression to select that will generate the same value by
 /// multiplying by a magic number.
@@ -6852,9 +6902,13 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
+  // Compute known bits once; used both to reduce the Hacker's Delight magic
+  // and to check simple-wide-magic conditions below.
+  KnownBits Known0 = DAG.computeKnownBits(N0);
+
   // Try to use leading zeros of the dividend to reduce the multiplier and
   // avoid expensive fixups.
-  unsigned KnownLeadingZeros = DAG.computeKnownBits(N0).countMinLeadingZeros();
+  unsigned KnownLeadingZeros = Known0.countMinLeadingZeros();
 
   // If we're after type legalization and SVT is not legal, use the
   // promoted type for creating constants to avoid creating nodes with
@@ -6882,6 +6936,13 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
 
   bool UseNPQ = false, UsePreShift = false, UsePostShift = false;
   bool UseWiden = false;
+  // For narrow scalar types (i8, i16) a simple fixup-free wide magic may exist:
+  //   trunc(srl(mul(zext(x, W), ceil(2^Shift / C)), Shift))
+  // This is preferred over the NPQ add-and-shift fixup when it applies.
+  // SimpleWideMulMagic being non-null indicates this path was taken.
+  EVT SimpleWideMulVT;
+  SDValue SimpleWideMulMagic;
+  SDValue SimpleWideMulShift;
   SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
 
   auto BuildUDIVPattern = [&](ConstantSDNode *C) {
@@ -6927,6 +6988,29 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
       UseNPQ |= magics.IsAdd;
       UsePreShift |= magics.PreShift != 0;
       UsePostShift |= magics.PostShift != 0;
+
+      // For narrow scalar types (i8, i16), when the Hacker's Delight magic
+      // requires the expensive NPQ add-and-shift fixup (IsAdd), check whether
+      // a simple fixup-free 64-bit magic exists:
+      //   trunc(srl(mul(zext(x, 64), ceil(2^Shift / C)), Shift))
+      // When IsAdd is required by HD, no 32-bit fixup-free solution exists, so
+      // we go directly to 64-bit. Only attempt when i64 MUL is natively legal.
+      EVT I64VT = EVT::getIntegerVT(*DAG.getContext(), 64);
+      bool IsScalar = !VT.isVector();
+      bool IsNarrow = EltBits <= 16;
+      bool NeedsAddFixup = magics.IsAdd;
+      bool HasLegalI64Mul =
+          isOperationLegalOrCustom(ISD::MUL, I64VT, IsAfterLegalization);
+      if (IsScalar && IsNarrow && NeedsAddFixup && HasLegalI64Mul) {
+        APInt MaxX = Known0.getMaxValue();
+        SimpleWideMagicInfo Info;
+        if (findSimpleWideMagic(Divisor, MaxX, Info)) {
+          SimpleWideMulVT = I64VT;
+          EVT WideShVT = getShiftAmountTy(I64VT, DAG.getDataLayout());
+          SimpleWideMulMagic = DAG.getConstant(Info.Magic, dl, I64VT);
+          SimpleWideMulShift = DAG.getConstant(Info.Shift, dl, WideShVT);
+        }
+      }
     }
 
     PreShifts.push_back(PreShift);
@@ -6962,6 +7046,21 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
     PostShift = PostShifts[0];
   }
 
+  // Simple wide magic: trunc(srl(mul(zext(x, 64), Magic), Shift)).
+  // Only applies to narrow scalars (i8, i16); divisor=1 is excluded (never
+  // sets SimpleWideMulMagic), so no IsOne select is needed here.
+  if (SimpleWideMulMagic.getNode()) {
+    SDValue Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, SimpleWideMulVT, N0);
+    Created.push_back(Wide.getNode());
+    SDValue Mul =
+        DAG.getNode(ISD::MUL, dl, SimpleWideMulVT, Wide, SimpleWideMulMagic);
+    Created.push_back(Mul.getNode());
+    SDValue Srl =
+        DAG.getNode(ISD::SRL, dl, SimpleWideMulVT, Mul, SimpleWideMulShift);
+    Created.push_back(Srl.getNode());
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Srl);
+  }
+
   if (UseWiden) {
     // Compute: (WideSVT(x) * MagicFactor) >> WideSVTBits.
     SDValue WideN0 = DAG.getNode(ISD::ZERO_EXTEND, dl, WideSVT, N0);
diff --git a/llvm/test/CodeGen/X86/udiv-narrow-magic.ll b/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
index 5aebf694da0ff..6c20b14f90ae6 100644
--- a/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
+++ b/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
@@ -12,14 +12,9 @@ define i8 @udiv_i8_by7(i8 %x) nounwind {
 ; CHECK-LABEL: udiv_i8_by7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    leal (%rax,%rax,8), %ecx
-; CHECK-NEXT:    leal (%rax,%rcx,4), %ecx
-; CHECK-NEXT:    shrl $8, %ecx
-; CHECK-NEXT:    subb %cl, %al
-; CHECK-NEXT:    shrb %al
-; CHECK-NEXT:    addb %cl, %al
-; CHECK-NEXT:    shrb $2, %al
-; CHECK-NEXT:    # kill: def $al killed $al killed $rax
+; CHECK-NEXT:    imull $293, %eax, %eax # imm = 0x125
+; CHECK-NEXT:    shrl $11, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %d = udiv i8 %x, 7
   ret i8 %d
@@ -104,15 +99,10 @@ define i8 @udiv_i8_then_add(i8 %x, i8 %y) nounwind {
 ; CHECK-LABEL: udiv_i8_then_add:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    leal (%rax,%rax,8), %ecx
-; CHECK-NEXT:    leal (%rax,%rcx,4), %ecx
-; CHECK-NEXT:    shrl $8, %ecx
-; CHECK-NEXT:    subb %cl, %al
-; CHECK-NEXT:    shrb %al
-; CHECK-NEXT:    addb %cl, %al
-; CHECK-NEXT:    shrb $2, %al
+; CHECK-NEXT:    imull $293, %eax, %eax # imm = 0x125
+; CHECK-NEXT:    shrl $11, %eax
 ; CHECK-NEXT:    addb %sil, %al
-; CHECK-NEXT:    # kill: def $al killed $al killed $rax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %d = udiv i8 %x, 7
   %r = add i8 %d, %y
@@ -125,14 +115,9 @@ define i16 @udiv_i16_by7(i16 %x) nounwind {
 ; CHECK-LABEL: udiv_i16_by7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movzwl %di, %eax
-; CHECK-NEXT:    imull $9363, %eax, %ecx # imm = 0x2493
-; CHECK-NEXT:    shrl $16, %ecx
-; CHECK-NEXT:    subl %ecx, %edi
-; CHECK-NEXT:    movzwl %di, %eax
-; CHECK-NEXT:    shrl %eax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    shrl $2, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    imulq $74899, %rax, %rax # imm = 0x12493
+; CHECK-NEXT:    shrq $19, %rax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $rax
 ; CHECK-NEXT:    retq
   %d = udiv i16 %x, 7
   ret i16 %d
@@ -156,13 +141,9 @@ define i32 @zext_udiv_i16_by7(i16 %x) nounwind {
 ; CHECK-LABEL: zext_udiv_i16_by7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movzwl %di, %eax
-; CHECK-NEXT:    imull $9363, %eax, %ecx # imm = 0x2493
-; CHECK-NEXT:    shrl $16, %ecx
-; CHECK-NEXT:    subl %ecx, %edi
-; CHECK-NEXT:    movzwl %di, %eax
-; CHECK-NEXT:    shrl %eax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    shrl $2, %eax
+; CHECK-NEXT:    imulq $74899, %rax, %rax # imm = 0x12493
+; CHECK-NEXT:    shrq $19, %rax
+; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-NEXT:    retq
   %d = udiv i16 %x, 7
   %z = zext i16 %d to i32
diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
index 3faa2a0720d4e..f9a3b017a6748 100644
--- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
@@ -22,8 +22,78 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
 ; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,124,98,1003,u,u,u,u]
 ; SSE2-NEXT:    psubw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: fold_urem_vec_1:
+; SSE-LABEL: fold_urem_vec_1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pextrw $1, %xmm0, %eax
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:    shrl $2, %ecx
+; SSE-NEXT:    imull $16913, %ecx, %ecx # imm = 0x4211
+; SSE-NEXT:    shrl $19, %ecx
+; SSE-NEXT:    imull $124, %ecx, %ecx
+; SSE-NEXT:    subl %ecx, %eax
+; SSE-NEXT:    movd %xmm0, %ecx
+; SSE-NEXT:    movzwl %cx, %edx
+; SSE-NEXT:    imull $44151, %edx, %edx # imm = 0xAC77
+; SSE-NEXT:    shrl $22, %edx
+; SSE-NEXT:    imull $95, %edx, %edx
+; SSE-NEXT:    subl %edx, %ecx
+; SSE-NEXT:    movd %ecx, %xmm1
+; SSE-NEXT:    pinsrw $1, %eax, %xmm1
+; SSE-NEXT:    pextrw $2, %xmm0, %eax
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:    shrl %ecx
+; SSE-NEXT:    imull $2675, %ecx, %ecx # imm = 0xA73
+; SSE-NEXT:    shrl $17, %ecx
+; SSE-NEXT:    imull $98, %ecx, %ecx
+; SSE-NEXT:    subl %ecx, %eax
+; SSE-NEXT:    pinsrw $2, %eax, %xmm1
+; SSE-NEXT:    pextrw $3, %xmm0, %eax
+; SSE-NEXT:    imull $1373, %eax, %ecx # imm = 0x55D
+; SSE-NEXT:    shrl $16, %ecx
+; SSE-NEXT:    movl %eax, %edx
+; SSE-NEXT:    subl %ecx, %edx
+; SSE-NEXT:    movzwl %dx, %edx
+; SSE-NEXT:    shrl %edx
+; SSE-NEXT:    addl %ecx, %edx
+; SSE-NEXT:    shrl $9, %edx
+; SSE-NEXT:    imull $1003, %edx, %ecx # imm = 0x3EB
+; SSE-NEXT:    subl %ecx, %eax
+; SSE-NEXT:    pinsrw $3, %eax, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+; SSE-LABEL: fold_urem_vec_1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pextrw $1, %xmm0, %eax
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:    shrl $2, %ecx
+; SSE-NEXT:    imull $16913, %ecx, %ecx # imm = 0x4211
+; SSE-NEXT:    shrl $19, %ecx
+; SSE-NEXT:    imull $124, %ecx, %ecx
+; SSE-NEXT:    subl %ecx, %eax
+; SSE-NEXT:    movd %xmm0, %ecx
+; SSE-NEXT:    movzwl %cx, %edx
+; SSE-NEXT:    imull $44151, %edx, %edx # imm = 0xAC77
+; SSE-NEXT:    shrl $22, %edx
+; SSE-NEXT:    imull $95, %edx, %edx
+; SSE-NEXT:    subl %edx, %ecx
+; SSE-NEXT:    movd %ecx, %xmm1
+; SSE-NEXT:    pinsrw $1, %eax, %xmm1
+; SSE-NEXT:    pextrw $2, %xmm0, %eax
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:    shrl %ecx
+; SSE-NEXT:    imull $2675, %ecx, %ecx # imm = 0xA73
+; SSE-NEXT:    shrl $17, %ecx
+; SSE-NEXT:    imull $98, %ecx, %ecx
+; SSE-NEXT:    subl %ecx, %eax
+; SSE-NEXT:    pinsrw $2, %eax, %xmm1
+; SSE-NEXT:    pextrw $3, %xmm0, %eax
+; SSE-NEXT:    imulq $66909, %rax, %rcx # imm = 0x1055D
+; SSE-NEXT:    shrq $26, %rcx
+; SSE-NEXT:    imull $1003, %ecx, %ecx # imm = 0x3EB
+; SSE-NEXT:    subl %ecx, %eax
+; SSE-NEXT:    pinsrw $3, %eax, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    movq {{.*#+}} xmm1 = [0,16384,32768,0,0,0,0,0]
 ; SSE4-NEXT:    pmulhuw %xmm0, %xmm1
@@ -62,6 +132,79 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
 ; AVX512-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,124,98,1003,u,u,u,u]
 ; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
+||||||| parent of 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
+; AVX-LABEL: fold_urem_vec_1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $1, %xmm0, %eax
+; AVX-NEXT:    movl %eax, %ecx
+; AVX-NEXT:    shrl $2, %ecx
+; AVX-NEXT:    imull $16913, %ecx, %ecx # imm = 0x4211
+; AVX-NEXT:    shrl $19, %ecx
+; AVX-NEXT:    imull $124, %ecx, %ecx
+; AVX-NEXT:    subl %ecx, %eax
+; AVX-NEXT:    vmovd %xmm0, %ecx
+; AVX-NEXT:    movzwl %cx, %edx
+; AVX-NEXT:    imull $44151, %edx, %edx # imm = 0xAC77
+; AVX-NEXT:    shrl $22, %edx
+; AVX-NEXT:    imull $95, %edx, %edx
+; AVX-NEXT:    subl %edx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
+; AVX-NEXT:    vpextrw $2, %xmm0, %eax
+; AVX-NEXT:    movl %eax, %ecx
+; AVX-NEXT:    shrl %ecx
+; AVX-NEXT:    imull $2675, %ecx, %ecx # imm = 0xA73
+; AVX-NEXT:    shrl $17, %ecx
+; AVX-NEXT:    imull $98, %ecx, %ecx
+; AVX-NEXT:    subl %ecx, %eax
+; AVX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX-NEXT:    vpextrw $3, %xmm0, %eax
+; AVX-NEXT:    imull $1373, %eax, %ecx # imm = 0x55D
+; AVX-NEXT:    shrl $16, %ecx
+; AVX-NEXT:    movl %eax, %edx
+; AVX-NEXT:    subl %ecx, %edx
+; AVX-NEXT:    movzwl %dx, %edx
+; AVX-NEXT:    shrl %edx
+; AVX-NEXT:    addl %ecx, %edx
+; AVX-NEXT:    shrl $9, %edx
+; AVX-NEXT:    imull $1003, %edx, %ecx # imm = 0x3EB
+; AVX-NEXT:    subl %ecx, %eax
+; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX-NEXT:    retq
+=======
+; AVX-LABEL: fold_urem_vec_1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $1, %xmm0, %eax
+; AVX-NEXT:    movl %eax, %ecx
+; AVX-NEXT:    shrl $2, %ecx
+; AVX-NEXT:    imull $16913, %ecx, %ecx # imm = 0x4211
+; AVX-NEXT:    shrl $19, %ecx
+; AVX-NEXT:    imull $124, %ecx, %ecx
+; AVX-NEXT:    subl %ecx, %eax
+; AVX-NEXT:    vmovd %xmm0, %ecx
+; AVX-NEXT:    movzwl %cx, %edx
+; AVX-NEXT:    imull $44151, %edx, %edx # imm = 0xAC77
+; AVX-NEXT:    shrl $22, %edx
+; AVX-NEXT:    imull $95, %edx, %edx
+; AVX-NEXT:    subl %edx, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
+; AVX-NEXT:    vpextrw $2, %xmm0, %eax
+; AVX-NEXT:    movl %eax, %ecx
+; AVX-NEXT:    shrl %ecx
+; AVX-NEXT:    imull $2675, %ecx, %ecx # imm = 0xA73
+; AVX-NEXT:    shrl $17, %ecx
+; AVX-NEXT:    imull $98, %ecx, %ecx
+; AVX-NEXT:    subl %ecx, %eax
+; AVX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX-NEXT:    vpextrw $3, %xmm0, %eax
+; AVX-NEXT:    imulq $66909, %rax, %rcx # imm = 0x1055D
+; AVX-NEXT:    shrq $26, %rcx
+; AVX-NEXT:    imull $1003, %ecx, %ecx # imm = 0x3EB
+; AVX-NEXT:    subl %ecx, %eax
+; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX-NEXT:    retq
+>>>>>>> 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
   %1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
   ret <4 x i16> %1
 }
@@ -175,6 +318,7 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
 
 ; Don't fold if the divisor is one.
 define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
+<<<<<<< HEAD
 ; SSE2-LABEL: dont_fold_urem_one:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
@@ -204,7 +348,67 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; SSE4-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,654,23,5423,u,u,u,u]
 ; SSE4-NEXT:    psubw %xmm2, %xmm0
 ; SSE4-NEXT:    retq
+||||||| parent of 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
+; SSE-LABEL: dont_fold_urem_one:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pextrw $2, %xmm0, %eax
+; SSE-NEXT:    imull $25645, %eax, %ecx # imm = 0x642D
+; SSE-NEXT:    shrl $16, %ecx
+; SSE-NEXT:    movl %eax, %edx
+; SSE-NEXT:    subl %ecx, %edx
+; SSE-NEXT:    movzwl %dx, %edx
+; SSE-NEXT:    shrl %edx
+; SSE-NEXT:    addl %ecx, %edx
+; SSE-NEXT:    shrl $4, %edx
+; SSE-NEXT:    leal (%rdx,%rdx,2), %ecx
+; SSE-NEXT:    shll $3, %ecx
+; SSE-NEXT:    subl %ecx, %edx
+; SSE-NEXT:    addl %eax, %edx
+; SSE-NEXT:    pextrw $1, %xmm0, %eax
+; SSE-NEXT:    imull $51307, %eax, %ecx # imm = 0xC86B
+; SSE-NEXT:    shrl $25, %ecx
+; SSE-NEXT:    imull $654, %ecx, %ecx # imm = 0x28E
+; SSE-NEXT:    subl %ecx, %eax
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pinsrw $1, %eax, %xmm1
+; SSE-NEXT:    pinsrw $2, %edx, %xmm1
+; SSE-NEXT:    pextrw $3, %xmm0, %eax
+; SSE-NEXT:    imull $12375, %eax, %ecx # imm = 0x3057
+; SSE-NEXT:    shrl $26, %ecx
+; SSE-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
+; SSE-NEXT:    subl %ecx, %eax
+; SSE-NEXT:    pinsrw $3, %eax, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+=======
+; SSE-LABEL: dont_fold_urem_one:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pextrw $1, %xmm0, %eax
+; SSE-NEXT:    imull $51307, %eax, %ecx # imm = 0xC86B
+; SSE-NEXT:    shrl $25, %ecx
+; SSE-NEXT:    imull $654, %ecx, %ecx # imm = 0x28E
+; SSE-NEXT:    subl %ecx, %eax
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pinsrw $1, %eax, %xmm1
+; SSE-NEXT:    pextrw $2, %xmm0, %eax
+; SSE-NEXT:    imulq $91181, %rax, %rcx # imm = 0x1642D
+; SSE-NEXT:    shrq $21, %rcx
+; SSE-NEXT:    leal (%rcx,%rcx,2), %edx
+; SSE-NEXT:    shll $3, %edx
+; SSE-NEXT:    subl %edx, %ecx
+; SSE-NEXT:    addl %eax, %ecx
+; SSE-NEXT:    pinsrw $2, %ecx, %xmm1
+; SSE-NEXT:    pextrw $3, %xmm0, %eax
+; SSE-NEXT:    imull $12375, %eax, %ecx # imm = 0x3057
+; SSE-NEXT:    shrl $26, %ecx
+; SSE-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
+; SSE-NEXT:    subl %ecx, %eax
+; SSE-NEXT:    pinsrw $3, %eax, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+>>>>>>> 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
 ;
+<<<<<<< HEAD
 ; AVX1OR2-LABEL: dont_fold_urem_one:
 ; AVX1OR2:       # %bb.0:
 ; AVX1OR2-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,51307,25645,12375,u,u,u,u]
@@ -228,6 +432,94 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; AVX512-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,654,23,5423,u,u,u,u]
 ; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
+||||||| parent of 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
+; AVX1OR2-LABEL: dont_fold_urem_one:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vpextrw $2, %xmm0, %eax
+; AVX1OR2-NEXT:    imull $25645, %eax, %ecx # imm = 0x642D
+; AVX1OR2-NEXT:    shrl $16, %ecx
+; AVX1OR2-NEXT:    movl %eax, %edx
+; AVX1OR2-NEXT:    subl %ecx, %edx
+; AVX1OR2-NEXT:    movzwl %dx, %edx
+; AVX1OR2-NEXT:    shrl %edx
+; AVX1OR2-NEXT:    addl %ecx, %edx
+; AVX1OR2-NEXT:    shrl $4, %edx
+; AVX1OR2-NEXT:    leal (%rdx,%rdx,2), %ecx
+; AVX1OR2-NEXT:    shll $3, %ecx
+; AVX1OR2-NEXT:    subl %ecx, %edx
+; AVX1OR2-NEXT:    addl %eax, %edx
+; AVX1OR2-NEXT:    vpextrw $1, %xmm0, %eax
+; AVX1OR2-NEXT:    imull $51307, %eax, %ecx # imm = 0xC86B
+; AVX1OR2-NEXT:    shrl $25, %ecx
+; AVX1OR2-NEXT:    imull $654, %ecx, %ecx # imm = 0x28E
+; AVX1OR2-NEXT:    subl %ecx, %eax
+; AVX1OR2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1OR2-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
+; AVX1OR2-NEXT:    vpinsrw $2, %edx, %xmm1, %xmm1
+; AVX1OR2-NEXT:    vpextrw $3, %xmm0, %eax
+; AVX1OR2-NEXT:    imull $12375, %eax, %ecx # imm = 0x3057
+; AVX1OR2-NEXT:    shrl $26, %ecx
+; AVX1OR2-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
+; AVX1OR2-NEXT:    subl %ecx, %eax
+; AVX1OR2-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX1OR2-NEXT:    retq
+;
+; AVX512-LABEL: dont_fold_urem_one:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpextrw $2, %xmm0, %eax
+; AVX512-NEXT:    imull $25645, %eax, %ecx # imm = 0x642D
+; AVX512-NEXT:    shrl $16, %ecx
+; AVX512-NEXT:    movl %eax, %edx
+; AVX512-NEXT:    subl %ecx, %edx
+; AVX512-NEXT:    movzwl %dx, %edx
+; AVX512-NEXT:    shrl %edx
+; AVX512-NEXT:    addl %ecx, %edx
+; AVX512-NEXT:    shrl $4, %edx
+; AVX512-NEXT:    leal (%rdx,%rdx,2), %ecx
+; AVX512-NEXT:    shll $3, %ecx
+; AVX512-NEXT:    subl %ecx, %edx
+; AVX512-NEXT:    vpextrw $1, %xmm0, %ecx
+; AVX512-NEXT:    addl %eax, %edx
+; AVX512-NEXT:    imull $51307, %ecx, %eax # imm = 0xC86B
+; AVX512-NEXT:    shrl $25, %eax
+; AVX512-NEXT:    imull $654, %eax, %eax # imm = 0x28E
+; AVX512-NEXT:    subl %eax, %ecx
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
+; AVX512-NEXT:    vpinsrw $2, %edx, %xmm1, %xmm1
+; AVX512-NEXT:    vpextrw $3, %xmm0, %eax
+; AVX512-NEXT:    imull $12375, %eax, %ecx # imm = 0x3057
+; AVX512-NEXT:    shrl $26, %ecx
+; AVX512-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
+; AVX512-NEXT:    subl %ecx, %eax
+; AVX512-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+=======
+; AVX-LABEL: dont_fold_urem_one:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrw $1, %xmm0, %eax
+; AVX-NEXT:    imull $51307, %eax, %ecx # imm = 0xC86B
+; AVX-NEXT:    shrl $25, %ecx
+; AVX-NEXT:    imull $654, %ecx, %ecx # imm = 0x28E
+; AVX-NEXT:    subl %ecx, %eax
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
+; AVX-NEXT:    vpextrw $2, %xmm0, %eax
+; AVX-NEXT:    imulq $91181, %rax, %rcx # imm = 0x1642D
+; AVX-NEXT:    shrq $21, %rcx
+; AVX-NEXT:    leal (%rcx,%rcx,2), %edx
+; AVX-NEXT:    shll $3, %edx
+; AVX-NEXT:    subl %edx, %ecx
+; AVX-NEXT:    addl %eax, %ecx
+; AVX-NEXT:    vpinsrw $2, %ecx, %xmm1, %xmm1
+; AVX-NEXT:    vpextrw $3, %xmm0, %eax
+; AVX-NEXT:    imull $12375, %eax, %ecx # imm = 0x3057
+; AVX-NEXT:    shrl $26, %ecx
+; AVX-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
+; AVX-NEXT:    subl %ecx, %eax
+; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX-NEXT:    retq
+>>>>>>> 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
   %1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
   ret <4 x i16> %1
 }
@@ -445,3 +737,5 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
   %1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
   ret <4 x i64> %1
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX1OR2: {{.*}}

>From e72af0a35e18d987eb29ef9b2daead17006e522a Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Wed, 25 Mar 2026 17:38:22 -0400
Subject: [PATCH 3/6] refactor to embed the fallback division inside the
 constant division strength reduction

---
 .../llvm/Support/DivisionByConstantInfo.h     |  24 ++-
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 156 ++++++------------
 llvm/lib/Support/DivisionByConstantInfo.cpp   |  74 +++++++--
 llvm/test/CodeGen/AArch64/rem-by-const.ll     |  20 +--
 llvm/test/CodeGen/RISCV/div-by-constant.ll    |  44 ++---
 llvm/test/CodeGen/X86/rotate-extract.ll       |  10 +-
 llvm/test/CodeGen/X86/udiv-narrow-magic.ll    |  40 +++++
 .../Support/DivisionByConstantTest.cpp        |  86 +++++++++-
 8 files changed, 279 insertions(+), 175 deletions(-)

diff --git a/llvm/include/llvm/Support/DivisionByConstantInfo.h b/llvm/include/llvm/Support/DivisionByConstantInfo.h
index 283283c912dfe..3e898e9321d1d 100644
--- a/llvm/include/llvm/Support/DivisionByConstantInfo.h
+++ b/llvm/include/llvm/Support/DivisionByConstantInfo.h
@@ -18,6 +18,26 @@
 
 namespace llvm {
 
+/// Standard integer bitwidths that division strength-reduction may widen to.
+/// The numeric value is the actual bit count, so arithmetic on it is valid.
+enum class IntegerBitWidth : unsigned {
+  None = 0,
+  I8 = 8,
+  I16 = 16,
+  I32 = 32,
+  I64 = 64,
+  I128 = 128,
+};
+
+/// Widening strategies for unsigned division by a constant.
+enum class UnsignedDivisionByConstantWidening {
+  None,
+  /// Use a widened high-half multiply and truncate the result.
+  MulHigh,
+  /// Use a widened full multiply followed by an explicit right shift.
+  FullMultiply,
+};
+
 /// Magic data for optimising signed division by a constant.
 struct SignedDivisionByConstantInfo {
   LLVM_ABI static SignedDivisionByConstantInfo get(const APInt &D);
@@ -30,12 +50,12 @@ struct UnsignedDivisionByConstantInfo {
   LLVM_ABI static UnsignedDivisionByConstantInfo
   get(const APInt &D, unsigned LeadingZeros = 0,
       bool AllowEvenDivisorOptimization = true,
-      bool AllowWidenOptimization = false);
+      IntegerBitWidth MaxBitWidth = IntegerBitWidth::None);
   APInt Magic;          ///< magic number
   bool IsAdd;           ///< add indicator
   unsigned PostShift;   ///< post-shift amount
   unsigned PreShift;    ///< pre-shift amount
-  bool Widen;           ///< use widen optimization
+  UnsignedDivisionByConstantWidening Widening;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 28f962c93b981..8880fd150ad2e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6781,56 +6781,6 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::ADD, dl, VT, Q, T);
 }
 
-/// For narrow scalar types (i8/i16) where Hacker's Delight requires an
-/// add-and-shift fixup (IsAdd), check whether a fixup-free 64-bit magic
-/// multiply exists:
-///   trunc(srl(mul(zext(x, 64), Magic), Shift))
-/// where Magic = ceil(2^Shift / C).
-///
-/// No fixup is needed when two conditions hold:
-///   (1) MaxX * Magic < 2^64    (multiply doesn't overflow 64 bits)
-///   (2) MaxX * (Magic*C - 2^Shift) < 2^Shift  (approximation error is exact)
-///
-/// When IsAdd is required by HD, no 32-bit fixup-free solution exists, so we
-/// search only in 64-bit space. Populates Info and returns true on success.
-struct SimpleWideMagicInfo {
-  APInt Magic;
-  unsigned Shift;
-};
-
-static bool findSimpleWideMagic(const APInt &Divisor, const APInt &MaxX,
-                                SimpleWideMagicInfo &Info) {
-  APInt DivWide = Divisor.zext(64);
-  APInt MaxWide = MaxX.zext(64);
-  unsigned MinShift = Divisor.ceilLogBase2();
-
-  for (unsigned Shift = MinShift; Shift < 64; ++Shift) {
-    APInt TwoToS = APInt(64, 1).shl(Shift);
-    APInt Magic = APIntOps::RoundingUDiv(TwoToS, DivWide, APInt::Rounding::UP);
-
-    // Check (1): MaxX * Magic must fit in 64 bits. Magic = ceil(2^Shift / C)
-    // grows monotonically with Shift, so once this overflows no larger Shift
-    // can succeed either.
-    bool Overflow = false;
-    (void)MaxWide.umul_ov(Magic, Overflow);
-    if (Overflow)
-      break;
-
-    // Check (2): MaxX * (Magic*C - 2^Shift) < 2^Shift.
-    // Magic*C never overflows 64 bits for i8/i16: Magic*C <= 2^Shift + C
-    // <= 2^63 + 65535 < 2^64.
-    APInt Error = Magic * DivWide - TwoToS;
-    APInt MaxError = MaxWide.umul_ov(Error, Overflow);
-    if (Overflow || MaxError.uge(TwoToS))
-      continue;
-
-    Info = {Magic, Shift};
-    return true;
-  }
-
-  return false;
-}
-
 /// Given an ISD::UDIV node expressing a divide by constant,
 /// return a DAG expression to select that will generate the same value by
 /// multiplying by a magic number.
@@ -6933,16 +6883,20 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
       VT == MVT::i32 &&
       isOperationLegalOrCustom(ISD::UMUL_LOHI, WideSVT, IsAfterLegalization);
   const bool AllowWiden = (HasWideMULHU || HasWideUMUL_LOHI);
+  // For narrow scalars (i8, i16), a fixup-free 64-bit magic may exist when
+  // i64 MUL is available: trunc(srl(mul(zext(x, 64), ceil(2^S/C)), S)).
+  const bool HasLegalI64Mul =
+      isOperationLegalOrCustom(ISD::MUL, WideSVT, IsAfterLegalization);
+  const bool AllowNarrowWiden =
+      EltBits <= 16 && !VT.isVector() && HasLegalI64Mul;
+  const IntegerBitWidth MaxBitWidth = (AllowWiden || AllowNarrowWiden)
+                                          ? IntegerBitWidth::I64
+                                          : IntegerBitWidth::None;
 
   bool UseNPQ = false, UsePreShift = false, UsePostShift = false;
-  bool UseWiden = false;
-  // For narrow scalar types (i8, i16) a simple fixup-free wide magic may exist:
-  //   trunc(srl(mul(zext(x, W), ceil(2^Shift / C)), Shift))
-  // This is preferred over the NPQ add-and-shift fixup when it applies.
-  // SimpleWideMulMagic being non-null indicates this path was taken.
-  EVT SimpleWideMulVT;
-  SDValue SimpleWideMulMagic;
-  SDValue SimpleWideMulShift;
+  UnsignedDivisionByConstantWidening WideningKind =
+      UnsignedDivisionByConstantWidening::None;
+  SDValue SimpleWidenShift;
   SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
 
   auto BuildUDIVPattern = [&](ConstantSDNode *C) {
@@ -6964,18 +6918,31 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
           UnsignedDivisionByConstantInfo::get(
               Divisor, std::min(KnownLeadingZeros, Divisor.countl_zero()),
               /*AllowEvenDivisorOptimization=*/true,
-              /*AllowWidenOptimization=*/AllowWiden);
+              /*MaxBitWidth=*/MaxBitWidth);
 
-      if (magics.Widen) {
-        UseWiden = true;
-        MagicFactor = DAG.getConstant(magics.Magic, dl, WideSVT);
-      } else {
+      switch (magics.Widening) {
+      case UnsignedDivisionByConstantWidening::None:
         MagicFactor = DAG.getConstant(magics.Magic.zext(SVTBits), dl, SVT);
+        break;
+      case UnsignedDivisionByConstantWidening::MulHigh:
+        WideningKind = UnsignedDivisionByConstantWidening::MulHigh;
+        MagicFactor = DAG.getConstant(magics.Magic, dl, WideSVT);
+        break;
+      case UnsignedDivisionByConstantWidening::FullMultiply:
+        WideningKind = UnsignedDivisionByConstantWidening::FullMultiply;
+        MagicFactor = DAG.getConstant(magics.Magic, dl, WideSVT);
+        // Simple wide magic (narrow types): explicit shift after multiply.
+        SimpleWidenShift =
+            DAG.getConstant(magics.PostShift, dl,
+                            getShiftAmountTy(WideSVT, DAG.getDataLayout()));
+        break;
       }
 
       assert(magics.PreShift < Divisor.getBitWidth() &&
              "We shouldn't generate an undefined shift!");
-      assert(magics.PostShift < Divisor.getBitWidth() &&
+      assert((magics.Widening !=
+                  UnsignedDivisionByConstantWidening::FullMultiply ||
+              magics.PostShift < magics.Magic.getBitWidth()) &&
              "We shouldn't generate an undefined shift!");
       assert((!magics.IsAdd || magics.PreShift == 0) &&
              "Unexpected pre-shift");
@@ -6987,30 +6954,9 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
           dl, SVT);
       UseNPQ |= magics.IsAdd;
       UsePreShift |= magics.PreShift != 0;
-      UsePostShift |= magics.PostShift != 0;
-
-      // For narrow scalar types (i8, i16), when the Hacker's Delight magic
-      // requires the expensive NPQ add-and-shift fixup (IsAdd), check whether
-      // a simple fixup-free 64-bit magic exists:
-      //   trunc(srl(mul(zext(x, 64), ceil(2^Shift / C)), Shift))
-      // When IsAdd is required by HD, no 32-bit fixup-free solution exists, so
-      // we go directly to 64-bit. Only attempt when i64 MUL is natively legal.
-      EVT I64VT = EVT::getIntegerVT(*DAG.getContext(), 64);
-      bool IsScalar = !VT.isVector();
-      bool IsNarrow = EltBits <= 16;
-      bool NeedsAddFixup = magics.IsAdd;
-      bool HasLegalI64Mul =
-          isOperationLegalOrCustom(ISD::MUL, I64VT, IsAfterLegalization);
-      if (IsScalar && IsNarrow && NeedsAddFixup && HasLegalI64Mul) {
-        APInt MaxX = Known0.getMaxValue();
-        SimpleWideMagicInfo Info;
-        if (findSimpleWideMagic(Divisor, MaxX, Info)) {
-          SimpleWideMulVT = I64VT;
-          EVT WideShVT = getShiftAmountTy(I64VT, DAG.getDataLayout());
-          SimpleWideMulMagic = DAG.getConstant(Info.Magic, dl, I64VT);
-          SimpleWideMulShift = DAG.getConstant(Info.Shift, dl, WideShVT);
-        }
-      }
+      UsePostShift |=
+          magics.Widening == UnsignedDivisionByConstantWidening::None &&
+          magics.PostShift != 0;
     }
 
     PreShifts.push_back(PreShift);
@@ -7046,27 +6992,27 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
     PostShift = PostShifts[0];
   }
 
-  // Simple wide magic: trunc(srl(mul(zext(x, 64), Magic), Shift)).
-  // Only applies to narrow scalars (i8, i16); divisor=1 is excluded (never
-  // sets SimpleWideMulMagic), so no IsOne select is needed here.
-  if (SimpleWideMulMagic.getNode()) {
-    SDValue Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, SimpleWideMulVT, N0);
-    Created.push_back(Wide.getNode());
-    SDValue Mul =
-        DAG.getNode(ISD::MUL, dl, SimpleWideMulVT, Wide, SimpleWideMulMagic);
+  switch (WideningKind) {
+  case UnsignedDivisionByConstantWidening::None:
+    break;
+  case UnsignedDivisionByConstantWidening::FullMultiply: {
+    SDValue WideN0 = DAG.getNode(ISD::ZERO_EXTEND, dl, WideSVT, N0);
+    Created.push_back(WideN0.getNode());
+    assert(EltBits <= 16 && !VT.isVector() &&
+           "FullMultiply widening is only expected for narrow scalars");
+    // Narrow scalar: trunc(srl(mul(zext(x, 64), ceil(2^S/C)), S)).
+    // divisor=1 never reaches here (handled above), so no IsOne select needed.
+    SDValue Mul = DAG.getNode(ISD::MUL, dl, WideSVT, WideN0, MagicFactor);
     Created.push_back(Mul.getNode());
-    SDValue Srl =
-        DAG.getNode(ISD::SRL, dl, SimpleWideMulVT, Mul, SimpleWideMulShift);
+    SDValue Srl = DAG.getNode(ISD::SRL, dl, WideSVT, Mul, SimpleWidenShift);
     Created.push_back(Srl.getNode());
     return DAG.getNode(ISD::TRUNCATE, dl, VT, Srl);
   }
-
-  if (UseWiden) {
-    // Compute: (WideSVT(x) * MagicFactor) >> WideSVTBits.
+  case UnsignedDivisionByConstantWidening::MulHigh: {
     SDValue WideN0 = DAG.getNode(ISD::ZERO_EXTEND, dl, WideSVT, N0);
-
-    // Perform WideSVTxWideSVT -> 2*WideSVT multiplication and extract high
-    // WideSVT bits
+    Created.push_back(WideN0.getNode());
+    assert(VT == MVT::i32 && "MulHigh widening is only expected for i32");
+    // i32 -> i64: extract high 32 bits of the 64-bit multiply.
     SDValue High;
     if (HasWideMULHU) {
       High = DAG.getNode(ISD::MULHU, dl, WideSVT, WideN0, MagicFactor);
@@ -7077,10 +7023,10 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
                       WideN0, MagicFactor);
       High = LoHi.getValue(1);
     }
-
     Created.push_back(High.getNode());
     return DAG.getNode(ISD::TRUNCATE, dl, VT, High);
   }
+  }
 
   SDValue Q = N0;
   if (UsePreShift) {
diff --git a/llvm/lib/Support/DivisionByConstantInfo.cpp b/llvm/lib/Support/DivisionByConstantInfo.cpp
index 18b756d1ce8db..4fa3d1fa54b81 100644
--- a/llvm/lib/Support/DivisionByConstantInfo.cpp
+++ b/llvm/lib/Support/DivisionByConstantInfo.cpp
@@ -14,6 +14,38 @@
 
 using namespace llvm;
 
+/// Find M = ceil(2^S / D) and S such that
+///   trunc(srl(mul(zext(x, W), M), S)) == udiv(x, D)
+/// for all x in [0, MaxX], where the multiply stays within W bits (no MULHU).
+///
+/// This gives a fixup-free alternative to the Hacker's Delight add-and-shift
+/// for narrow types (i8/i16) widened into a larger integer.  The HD algorithm
+/// in wide space produces MULHU-style magic (≈2^W/D), which overflows a plain
+/// W-bit multiply; this routine instead finds the smallest S ≥ ceil(log2(D))
+/// for which the product MaxX * ceil(2^S/D) fits in W bits and the rounding
+/// error is harmless.
+static bool findSimpleWideMagic(const APInt &D, const APInt &MaxX, unsigned W,
+                                APInt &Magic, unsigned &Shift) {
+  APInt DivW = D.zext(W);
+  APInt MaxW = MaxX.zext(W);
+  for (unsigned S = D.ceilLogBase2(); S < W; ++S) {
+    APInt TwoToS = APInt::getOneBitSet(W, S);
+    APInt M = APIntOps::RoundingUDiv(TwoToS, DivW, APInt::Rounding::UP);
+    bool Overflow = false;
+    (void)MaxW.umul_ov(M, Overflow);
+    if (Overflow)
+      break; // M grows monotonically; no larger S can succeed.
+    APInt Error = M * DivW - TwoToS;
+    APInt MaxError = MaxW.umul_ov(Error, Overflow);
+    if (Overflow || MaxError.uge(TwoToS))
+      continue;
+    Magic = M;
+    Shift = S;
+    return true;
+  }
+  return false;
+}
+
 /// Calculate the magic numbers required to implement a signed integer division
 /// by a constant as a sequence of multiplies, adds and shifts.  Requires that
 /// the divisor not be 0, 1, or -1.  Taken from "Hacker's Delight", Henry S.
@@ -73,14 +105,15 @@ SignedDivisionByConstantInfo SignedDivisionByConstantInfo::get(const APInt &D) {
 UnsignedDivisionByConstantInfo
 UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros,
                                     bool AllowEvenDivisorOptimization,
-                                    bool AllowWidenOptimization) {
+                                    IntegerBitWidth MaxBitWidth) {
+  unsigned WideningBitWidth = static_cast<unsigned>(MaxBitWidth);
   assert(!D.isZero() && !D.isOne() && "Precondition violation.");
   assert(D.getBitWidth() > 1 && "Does not work at smaller bitwidths.");
 
   APInt Delta;
   struct UnsignedDivisionByConstantInfo Retval;
   Retval.IsAdd = false; // initialize "add" indicator
-  Retval.Widen = false; // initialize widen indicator
+  Retval.Widening = UnsignedDivisionByConstantWidening::None;
   APInt AllOnes =
       APInt::getLowBitsSet(D.getBitWidth(), D.getBitWidth() - LeadingZeros);
   APInt SignedMin = APInt::getSignedMinValue(D.getBitWidth());
@@ -154,19 +187,32 @@ UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros,
   }
   Retval.PreShift = 0;
 
-  // For IsAdd case with AllowWidenOptimization, compute widened magic.
-  // This is for optimizing 32-bit division using 64-bit multiplication.
-  // The actual magic constant is 2^W + Magic ((W+1)-bit).
-  // We pre-shift it left by (W*2 - OriginalShift) to avoid runtime shift.
-  if (Retval.IsAdd && AllowWidenOptimization) {
+  if (Retval.IsAdd && WideningBitWidth) {
     unsigned W = D.getBitWidth();
-    unsigned OriginalShift = Retval.PostShift + W + 1;
-    // Since PostShift >= 1, shift amount is at most W-2, so W*2 bits suffice.
-    Retval.Magic = (APInt::getOneBitSet(W * 2, W) + Retval.Magic.zext(W * 2))
-                       .shl(W * 2 - OriginalShift);
-    Retval.IsAdd = false;
-    Retval.PostShift = 0;
-    Retval.Widen = true;
+    if (WideningBitWidth == W * 2) {
+      // MULHU-style widen: pre-shift the (W+1)-bit magic into a W*2-bit value
+      // so the high W bits of the wide multiply give the quotient directly.
+      unsigned OriginalShift = Retval.PostShift + W + 1;
+      // Since PostShift >= 1, shift amount is at most W-2, so W*2 bits suffice.
+      Retval.Magic = (APInt::getOneBitSet(W * 2, W) + Retval.Magic.zext(W * 2))
+                         .shl(W * 2 - OriginalShift);
+      Retval.IsAdd = false;
+      Retval.PostShift = 0;
+      Retval.Widening = UnsignedDivisionByConstantWidening::MulHigh;
+    } else if (WideningBitWidth > W * 2) {
+      // Simple wide magic: trunc(srl(mul(zext(x, WideningBitWidth), M), S))
+      // with M = ceil(2^S/D). Note W here is the narrow width of D. Running HD
+      // at the wide width would give MULHU-style magic whose full product
+      // overflows; findSimpleWideMagic keeps MaxX * M within WideningBitWidth.
+      APInt Magic;
+      unsigned Shift;
+      if (findSimpleWideMagic(D, AllOnes, WideningBitWidth, Magic, Shift)) {
+        Retval.Magic = std::move(Magic);
+        Retval.PostShift = Shift;
+        Retval.IsAdd = false;
+        Retval.Widening = UnsignedDivisionByConstantWidening::FullMultiply;
+      }
+    }
   }
 
   return Retval;
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index 1c6b241cb8f12..b1f2d20553c6d 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -67,14 +67,10 @@ entry:
 define i8 @ui8_7(i8 %a, i8 %b) {
 ; CHECK-SD-LABEL: ui8_7:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mov w8, #37 // =0x25
+; CHECK-SD-NEXT:    mov w8, #293 // =0x125
 ; CHECK-SD-NEXT:    and w9, w0, #0xff
 ; CHECK-SD-NEXT:    mul w8, w9, w8
-; CHECK-SD-NEXT:    lsr w8, w8, #8
-; CHECK-SD-NEXT:    sub w9, w0, w8
-; CHECK-SD-NEXT:    and w9, w9, #0xfe
-; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
-; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    lsr w8, w8, #11
 ; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
 ; CHECK-SD-NEXT:    add w0, w0, w8
 ; CHECK-SD-NEXT:    ret
@@ -187,13 +183,11 @@ define i16 @ui16_7(i16 %a, i16 %b) {
 ; CHECK-SD-LABEL: ui16_7:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    mov w8, #9363 // =0x2493
-; CHECK-SD-NEXT:    and w9, w0, #0xffff
-; CHECK-SD-NEXT:    mul w8, w9, w8
-; CHECK-SD-NEXT:    lsr w8, w8, #16
-; CHECK-SD-NEXT:    sub w9, w0, w8
-; CHECK-SD-NEXT:    and w9, w9, #0xfffe
-; CHECK-SD-NEXT:    add w8, w8, w9, lsr #1
-; CHECK-SD-NEXT:    lsr w8, w8, #2
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    and x9, x0, #0xffff
+; CHECK-SD-NEXT:    movk w8, #1, lsl #16
+; CHECK-SD-NEXT:    umull x8, w9, w8
+; CHECK-SD-NEXT:    lsr x8, x8, #19
 ; CHECK-SD-NEXT:    sub w8, w8, w8, lsl #3
 ; CHECK-SD-NEXT:    add w0, w0, w8
 ; CHECK-SD-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index 1aa0cd053f3ed..40b599bc1a076 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -245,31 +245,13 @@ define i8 @udiv8_constant_add(i8 %a) nounwind {
 ; RV32IMZB-NEXT:    srli a0, a0, 2
 ; RV32IMZB-NEXT:    ret
 ;
-; RV64IM-LABEL: udiv8_constant_add:
-; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    zext.b a1, a0
-; RV64IM-NEXT:    li a2, 37
-; RV64IM-NEXT:    mul a1, a1, a2
-; RV64IM-NEXT:    srli a1, a1, 8
-; RV64IM-NEXT:    sub a0, a0, a1
-; RV64IM-NEXT:    slli a0, a0, 56
-; RV64IM-NEXT:    srli a0, a0, 57
-; RV64IM-NEXT:    add a0, a0, a1
-; RV64IM-NEXT:    srli a0, a0, 2
-; RV64IM-NEXT:    ret
-;
-; RV64IMZB-LABEL: udiv8_constant_add:
-; RV64IMZB:       # %bb.0:
-; RV64IMZB-NEXT:    zext.b a1, a0
-; RV64IMZB-NEXT:    sh3add a2, a1, a1
-; RV64IMZB-NEXT:    sh2add a1, a2, a1
-; RV64IMZB-NEXT:    srli a1, a1, 8
-; RV64IMZB-NEXT:    sub a0, a0, a1
-; RV64IMZB-NEXT:    slli a0, a0, 56
-; RV64IMZB-NEXT:    srli a0, a0, 57
-; RV64IMZB-NEXT:    add a0, a0, a1
-; RV64IMZB-NEXT:    srli a0, a0, 2
-; RV64IMZB-NEXT:    ret
+; RV64-LABEL: udiv8_constant_add:
+; RV64:       # %bb.0:
+; RV64-NEXT:    zext.b a0, a0
+; RV64-NEXT:    li a1, 293
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    srli a0, a0, 11
+; RV64-NEXT:    ret
   %1 = udiv i8 %a, 7
   ret i8 %1
 }
@@ -311,15 +293,11 @@ define i16 @udiv16_constant_add(i16 %a) nounwind {
 ;
 ; RV64-LABEL: udiv16_constant_add:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    slli a1, a0, 48
-; RV64-NEXT:    lui a2, 149808
-; RV64-NEXT:    mulhu a1, a1, a2
-; RV64-NEXT:    srli a1, a1, 16
-; RV64-NEXT:    sub a0, a0, a1
+; RV64-NEXT:    lui a1, 74899
+; RV64-NEXT:    slli a1, a1, 4
 ; RV64-NEXT:    slli a0, a0, 48
-; RV64-NEXT:    srli a0, a0, 49
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    srli a0, a0, 2
+; RV64-NEXT:    mulhu a0, a0, a1
+; RV64-NEXT:    srli a0, a0, 19
 ; RV64-NEXT:    ret
   %1 = udiv i16 %a, 7
   ret i16 %1
diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll
index b5332068d7edd..66e53b21343fb 100644
--- a/llvm/test/CodeGen/X86/rotate-extract.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract.ll
@@ -240,15 +240,11 @@ define i8 @no_extract_udiv(i8 %i) nounwind {
 ; X64-NEXT:    movzbl %dil, %ecx
 ; X64-NEXT:    imull $171, %ecx, %eax
 ; X64-NEXT:    shrl $8, %eax
-; X64-NEXT:    imull $79, %ecx, %edx
-; X64-NEXT:    shrl $8, %edx
-; X64-NEXT:    subb %dl, %cl
-; X64-NEXT:    shrb %cl
-; X64-NEXT:    addb %dl, %cl
-; X64-NEXT:    shrb $5, %cl
+; X64-NEXT:    imull $335, %ecx, %ecx # imm = 0x14F
+; X64-NEXT:    shrl $14, %ecx
 ; X64-NEXT:    shlb $3, %al
+; X64-NEXT:    andb $-16, %al
 ; X64-NEXT:    orb %cl, %al
-; X64-NEXT:    andb $-9, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %lhs_div = udiv i8 %i, 3
diff --git a/llvm/test/CodeGen/X86/udiv-narrow-magic.ll b/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
index 6c20b14f90ae6..5d5e893a3c59f 100644
--- a/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
+++ b/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
@@ -136,6 +136,46 @@ define i16 @udiv_i16_by100(i16 %x) nounwind {
   ret i16 %d
 }
 
+; Vector narrow udiv - should NOT use the scalar narrow-magic widening path.
+define <16 x i8> @udiv_v16i8_by7(<16 x i8> %x) nounwind {
+; CHECK-LABEL: udiv_v16i8_by7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; CHECK-NEXT:    movdqa {{.*#+}} xmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
+; CHECK-NEXT:    pmullw %xmm3, %xmm2
+; CHECK-NEXT:    psrlw $8, %xmm2
+; CHECK-NEXT:    movdqa %xmm0, %xmm4
+; CHECK-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; CHECK-NEXT:    pmullw %xmm3, %xmm4
+; CHECK-NEXT:    psrlw $8, %xmm4
+; CHECK-NEXT:    packuswb %xmm2, %xmm4
+; CHECK-NEXT:    psubb %xmm4, %xmm0
+; CHECK-NEXT:    psrlw $1, %xmm0
+; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    paddb %xmm4, %xmm0
+; CHECK-NEXT:    psrlw $2, %xmm0
+; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %d = udiv <16 x i8> %x, splat (i8 7)
+  ret <16 x i8> %d
+}
+
+define <8 x i16> @udiv_v8i16_by7(<8 x i16> %x) nounwind {
+; CHECK-LABEL: udiv_v8i16_by7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; CHECK-NEXT:    pmulhuw %xmm0, %xmm1
+; CHECK-NEXT:    psubw %xmm1, %xmm0
+; CHECK-NEXT:    psrlw $1, %xmm0
+; CHECK-NEXT:    paddw %xmm1, %xmm0
+; CHECK-NEXT:    psrlw $2, %xmm0
+; CHECK-NEXT:    retq
+  %d = udiv <8 x i16> %x, splat (i16 7)
+  ret <8 x i16> %d
+}
+
 ; zext(udiv i16) - should also improve.
 define i32 @zext_udiv_i16_by7(i16 %x) nounwind {
 ; CHECK-LABEL: zext_udiv_i16_by7:
diff --git a/llvm/unittests/Support/DivisionByConstantTest.cpp b/llvm/unittests/Support/DivisionByConstantTest.cpp
index 715dded68ff01..7ea87851ac926 100644
--- a/llvm/unittests/Support/DivisionByConstantTest.cpp
+++ b/llvm/unittests/Support/DivisionByConstantTest.cpp
@@ -95,6 +95,13 @@ APInt MULHU(APInt X, APInt Y) {
   return (X.zext(WideBits) * Y.zext(WideBits)).lshr(Bits).trunc(Bits);
 }
 
+APInt WideMULHU(APInt X, APInt Y) {
+  assert(X.getBitWidth() == Y.getBitWidth() && "Expected matching widths");
+  unsigned Bits = X.getBitWidth();
+  unsigned WideBits = 2 * Bits;
+  return (X.zext(WideBits) * Y.zext(WideBits)).lshr(Bits).trunc(Bits);
+}
+
 APInt UnsignedDivideUsingMagic(const APInt &Numerator, const APInt &Divisor,
                                bool LZOptimization,
                                bool AllowEvenDivisorOptimization, bool ForceNPQ,
@@ -116,13 +123,26 @@ APInt UnsignedDivideUsingMagic(const APInt &Numerator, const APInt &Divisor,
 
   assert(Magics.PreShift < Divisor.getBitWidth() &&
          "We shouldn't generate an undefined shift!");
-  assert(Magics.PostShift < Divisor.getBitWidth() &&
+  assert((Magics.Widening != UnsignedDivisionByConstantWidening::FullMultiply ||
+          Magics.PostShift < Magics.Magic.getBitWidth()) &&
          "We shouldn't generate an undefined shift!");
   assert((!Magics.IsAdd || Magics.PreShift == 0) && "Unexpected pre-shift");
   unsigned PreShift = Magics.PreShift;
   unsigned PostShift = Magics.PostShift;
   bool UseNPQ = Magics.IsAdd;
 
+  if (Magics.Widening == UnsignedDivisionByConstantWidening::MulHigh) {
+    unsigned WideBits = Magics.Magic.getBitWidth();
+    APInt Q = WideMULHU(Numerator.zext(WideBits), Magics.Magic);
+    return Q.trunc(Bits);
+  }
+
+  if (Magics.Widening == UnsignedDivisionByConstantWidening::FullMultiply) {
+    unsigned WideBits = Magics.Magic.getBitWidth();
+    APInt Q = Numerator.zext(WideBits) * Magics.Magic;
+    return Q.lshr(PostShift).trunc(Bits);
+  }
+
   APInt NPQFactor =
       UseNPQ ? APInt::getSignedMinValue(Bits) : APInt::getZero(Bits);
 
@@ -186,4 +206,68 @@ TEST(UnsignedDivisionByConstantTest, Test) {
   }
 }
 
+TEST(UnsignedDivisionByConstantTest, WideningKinds) {
+  {
+    APInt Divisor(8, 7);
+    auto Magics = UnsignedDivisionByConstantInfo::get(
+        Divisor, /*LeadingZeros=*/0, /*AllowEvenDivisorOptimization=*/true,
+        IntegerBitWidth::I16);
+    EXPECT_EQ(Magics.Widening, UnsignedDivisionByConstantWidening::MulHigh);
+    EXPECT_EQ(Magics.Magic.getBitWidth(), 16u);
+    EXPECT_FALSE(Magics.IsAdd);
+    EXPECT_EQ(Magics.PostShift, 0u);
+  }
+
+  {
+    APInt Divisor(8, 7);
+    auto Magics = UnsignedDivisionByConstantInfo::get(
+        Divisor, /*LeadingZeros=*/0, /*AllowEvenDivisorOptimization=*/true,
+        IntegerBitWidth::I64);
+    EXPECT_EQ(Magics.Widening,
+              UnsignedDivisionByConstantWidening::FullMultiply);
+    EXPECT_EQ(Magics.Magic.getBitWidth(), 64u);
+    EXPECT_FALSE(Magics.IsAdd);
+    EXPECT_GT(Magics.PostShift, 0u);
+  }
+
+  {
+    APInt Divisor(32, 7);
+    auto Magics = UnsignedDivisionByConstantInfo::get(
+        Divisor, /*LeadingZeros=*/0, /*AllowEvenDivisorOptimization=*/true,
+        IntegerBitWidth::I64);
+    EXPECT_EQ(Magics.Widening, UnsignedDivisionByConstantWidening::MulHigh);
+    EXPECT_EQ(Magics.Magic.getBitWidth(), 64u);
+    EXPECT_FALSE(Magics.IsAdd);
+    EXPECT_EQ(Magics.PostShift, 0u);
+  }
+}
+
+TEST(UnsignedDivisionByConstantTest, WidenedMagicExecutesCorrectly) {
+  auto CheckAllNumerators = [](const APInt &Divisor,
+                               IntegerBitWidth MaxBitWidth,
+                               UnsignedDivisionByConstantWidening Widening) {
+    auto Magics = UnsignedDivisionByConstantInfo::get(
+        Divisor, /*LeadingZeros=*/0, /*AllowEvenDivisorOptimization=*/true,
+        MaxBitWidth);
+    ASSERT_EQ(Magics.Widening, Widening);
+    EnumerateAPInts(Divisor.getBitWidth(), [&](const APInt &Numerator) {
+      ASSERT_EQ(UnsignedDivideUsingMagic(Numerator, Divisor,
+                                         /*LZOptimization=*/false,
+                                         /*AllowEvenDivisorOptimization=*/true,
+                                         /*ForceNPQ=*/false, Magics),
+                Numerator.udiv(Divisor))
+          << " ... given the operation: udiv i" << Divisor.getBitWidth() << " "
+          << Numerator << ", " << Divisor << " with widening "
+          << static_cast<int>(Widening);
+    });
+  };
+
+  CheckAllNumerators(APInt(8, 7), IntegerBitWidth::I16,
+                     UnsignedDivisionByConstantWidening::MulHigh);
+  CheckAllNumerators(APInt(8, 7), IntegerBitWidth::I64,
+                     UnsignedDivisionByConstantWidening::FullMultiply);
+  CheckAllNumerators(APInt(16, 7), IntegerBitWidth::I64,
+                     UnsignedDivisionByConstantWidening::FullMultiply);
+}
+
 } // end anonymous namespace

>From e53bcb19211881e1ac6f25f9f9469a1b05ee6d06 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Wed, 25 Mar 2026 19:44:51 -0400
Subject: [PATCH 4/6] allow 32-bit targets with UMUL_LOHI to use the shift free
 and fixup free division strength reduction for i8/i16

---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  47 +-
 .../CodeGen/Generic/udiv-narrow-widening.ll   | 487 ++++++++++++++++++
 .../CodeGen/X86/udiv-const-optimization.ll    |  61 +++
 llvm/test/CodeGen/X86/udiv-narrow-magic.ll    | 191 -------
 llvm/test/CodeGen/X86/urem-vector-lkk.ll      |  16 +-
 5 files changed, 588 insertions(+), 214 deletions(-)
 create mode 100644 llvm/test/CodeGen/Generic/udiv-narrow-widening.ll
 delete mode 100644 llvm/test/CodeGen/X86/udiv-narrow-magic.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 8880fd150ad2e..f2e6ca9f1c154 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6873,25 +6873,41 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
   }
   const unsigned SVTBits = SVT.getSizeInBits();
 
-  // Allow i32 to be widened to i64 for uncooperative divisors if i64 MULHU or
-  // UMUL_LOHI is supported.
-  const EVT WideSVT = MVT::i64;
+  // Allow scalar i16 to be widened to i32 for uncooperative divisors if i32
+  // MULHU or UMUL_LOHI is supported (shiftless MulHigh, prefer over i64 widen).
+  const bool HasWideI32MULHU =
+      VT == MVT::i16 &&
+      isOperationLegalOrCustom(ISD::MULHU, MVT::i32, IsAfterLegalization);
+  const bool HasWideI32UMUL_LOHI =
+      VT == MVT::i16 &&
+      isOperationLegalOrCustom(ISD::UMUL_LOHI, MVT::i32, IsAfterLegalization);
+  // Allow scalar i32 to be widened to i64 for uncooperative divisors if i64
+  // MULHU or UMUL_LOHI is supported (shiftless MulHigh).
   const bool HasWideMULHU =
-      VT == MVT::i32 &&
-      isOperationLegalOrCustom(ISD::MULHU, WideSVT, IsAfterLegalization);
+      HasWideI32MULHU ||
+      (VT == MVT::i32 &&
+       isOperationLegalOrCustom(ISD::MULHU, MVT::i64, IsAfterLegalization));
   const bool HasWideUMUL_LOHI =
-      VT == MVT::i32 &&
-      isOperationLegalOrCustom(ISD::UMUL_LOHI, WideSVT, IsAfterLegalization);
+      HasWideI32UMUL_LOHI ||
+      (VT == MVT::i32 &&
+       isOperationLegalOrCustom(ISD::UMUL_LOHI, MVT::i64, IsAfterLegalization));
   const bool AllowWiden = (HasWideMULHU || HasWideUMUL_LOHI);
+  // WideSVT: the doubled type for MulHigh multiplication.
+  // Use i32 for the i16->i32 case, i64 otherwise.
+  const EVT WideSVT =
+      (HasWideI32MULHU || HasWideI32UMUL_LOHI) ? MVT::i32 : MVT::i64;
   // For narrow scalars (i8, i16), a fixup-free 64-bit magic may exist when
   // i64 MUL is available: trunc(srl(mul(zext(x, 64), ceil(2^S/C)), S)).
+  // Skip this when i32 MulHigh is already preferred for i16.
   const bool HasLegalI64Mul =
-      isOperationLegalOrCustom(ISD::MUL, WideSVT, IsAfterLegalization);
-  const bool AllowNarrowWiden =
-      EltBits <= 16 && !VT.isVector() && HasLegalI64Mul;
-  const IntegerBitWidth MaxBitWidth = (AllowWiden || AllowNarrowWiden)
-                                          ? IntegerBitWidth::I64
-                                          : IntegerBitWidth::None;
+      isOperationLegalOrCustom(ISD::MUL, MVT::i64, IsAfterLegalization);
+  const bool AllowNarrowWiden = EltBits <= 16 && !VT.isVector() &&
+                                !(HasWideI32MULHU || HasWideI32UMUL_LOHI) &&
+                                HasLegalI64Mul;
+  const IntegerBitWidth MaxBitWidth =
+      (HasWideI32MULHU || HasWideI32UMUL_LOHI) ? IntegerBitWidth::I32
+      : (AllowWiden || AllowNarrowWiden)       ? IntegerBitWidth::I64
+                                               : IntegerBitWidth::None;
 
   bool UseNPQ = false, UsePreShift = false, UsePostShift = false;
   UnsignedDivisionByConstantWidening WideningKind =
@@ -7011,8 +7027,9 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
   case UnsignedDivisionByConstantWidening::MulHigh: {
     SDValue WideN0 = DAG.getNode(ISD::ZERO_EXTEND, dl, WideSVT, N0);
     Created.push_back(WideN0.getNode());
-    assert(VT == MVT::i32 && "MulHigh widening is only expected for i32");
-    // i32 -> i64: extract high 32 bits of the 64-bit multiply.
+    assert((VT == MVT::i32 || VT == MVT::i16) &&
+           "MulHigh widening is only expected for i32 or i16");
+    // Extract the high half of the widened multiply (i16->i32 or i32->i64).
     SDValue High;
     if (HasWideMULHU) {
       High = DAG.getNode(ISD::MULHU, dl, WideSVT, WideN0, MagicFactor);
diff --git a/llvm/test/CodeGen/Generic/udiv-narrow-widening.ll b/llvm/test/CodeGen/Generic/udiv-narrow-widening.ll
new file mode 100644
index 0000000000000..124bb7cb73565
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/udiv-narrow-widening.ll
@@ -0,0 +1,487 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=i386-unknown-linux-gnu              < %s | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu            < %s | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu           < %s | FileCheck %s --check-prefixes=CHECK,AARCH64
+; RUN: llc -mtriple=riscv32-unknown-linux-gnu -mattr=+m < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64-unknown-linux-gnu -mattr=+m < %s | FileCheck %s --check-prefixes=CHECK,RV64
+
+; When a narrow udiv-by-constant (i8, i16) is lowered via BuildUDIV, the
+; Granlund-Montgomery (GM) algorithm may need an expensive add-and-shift fixup
+; for IsAdd=true divisors. A wider legal type (e.g. i32 for i16) has enough
+; headroom for a simple magic = ceil(2^Shift / C) that needs no fixup at all.
+;
+; i16 IsAdd=true divisors (e.g. 7): on targets with i32 UMUL_LOHI (i386,
+; x86-64) we emit a shiftless 32x32->64 multiply and take the high 32 bits.
+; On AArch64 (no i32 UMUL_LOHI) we fall back to FullMultiply via i64.
+
+; --- i8 cases ---
+
+define i8 @udiv_i8_by7(i8 %x) nounwind {
+; X86-LABEL: udiv_i8_by7:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    leal (%eax,%ecx,4), %ecx
+; X86-NEXT:    subb %ch, %al
+; X86-NEXT:    shrb %al
+; X86-NEXT:    addb %ch, %al
+; X86-NEXT:    shrb $2, %al
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: udiv_i8_by7:
+; X64:       # %bb.0:
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    imull $293, %eax, %eax # imm = 0x125
+; X64-NEXT:    shrl $11, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; AARCH64-LABEL: udiv_i8_by7:
+; AARCH64:       // %bb.0:
+; AARCH64-NEXT:    // kill: def $w0 killed $w0 def $x0
+; AARCH64-NEXT:    mov w8, #293 // =0x125
+; AARCH64-NEXT:    and x9, x0, #0xff
+; AARCH64-NEXT:    umull x8, w9, w8
+; AARCH64-NEXT:    lsr x0, x8, #11
+; AARCH64-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; AARCH64-NEXT:    ret
+;
+; RV32-LABEL: udiv_i8_by7:
+; RV32:       # %bb.0:
+; RV32-NEXT:    zext.b a1, a0
+; RV32-NEXT:    li a2, 37
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    srli a1, a1, 8
+; RV32-NEXT:    sub a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 24
+; RV32-NEXT:    srli a0, a0, 25
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    srli a0, a0, 2
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: udiv_i8_by7:
+; RV64:       # %bb.0:
+; RV64-NEXT:    zext.b a0, a0
+; RV64-NEXT:    li a1, 293
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    srli a0, a0, 11
+; RV64-NEXT:    ret
+  %d = udiv i8 %x, 7
+  ret i8 %d
+}
+
+define i8 @udiv_i8_by5(i8 %x) nounwind {
+; X86-LABEL: udiv_i8_by5:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $205, %eax, %eax
+; X86-NEXT:    shrl $10, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: udiv_i8_by5:
+; X64:       # %bb.0:
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    imull $205, %eax, %eax
+; X64-NEXT:    shrl $10, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; AARCH64-LABEL: udiv_i8_by5:
+; AARCH64:       // %bb.0:
+; AARCH64-NEXT:    mov w8, #205 // =0xcd
+; AARCH64-NEXT:    and w9, w0, #0xff
+; AARCH64-NEXT:    mul w8, w9, w8
+; AARCH64-NEXT:    lsr w0, w8, #10
+; AARCH64-NEXT:    ret
+;
+; RV32-LABEL: udiv_i8_by5:
+; RV32:       # %bb.0:
+; RV32-NEXT:    zext.b a0, a0
+; RV32-NEXT:    li a1, 205
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    srli a0, a0, 10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: udiv_i8_by5:
+; RV64:       # %bb.0:
+; RV64-NEXT:    zext.b a0, a0
+; RV64-NEXT:    li a1, 205
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    srli a0, a0, 10
+; RV64-NEXT:    ret
+  %d = udiv i8 %x, 5
+  ret i8 %d
+}
+
+define i8 @udiv_i8_by3(i8 %x) nounwind {
+; X86-LABEL: udiv_i8_by3:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $171, %eax, %eax
+; X86-NEXT:    shrl $9, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: udiv_i8_by3:
+; X64:       # %bb.0:
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    imull $171, %eax, %eax
+; X64-NEXT:    shrl $9, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; AARCH64-LABEL: udiv_i8_by3:
+; AARCH64:       // %bb.0:
+; AARCH64-NEXT:    mov w8, #171 // =0xab
+; AARCH64-NEXT:    and w9, w0, #0xff
+; AARCH64-NEXT:    mul w8, w9, w8
+; AARCH64-NEXT:    lsr w0, w8, #9
+; AARCH64-NEXT:    ret
+;
+; RV32-LABEL: udiv_i8_by3:
+; RV32:       # %bb.0:
+; RV32-NEXT:    zext.b a0, a0
+; RV32-NEXT:    li a1, 171
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    srli a0, a0, 9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: udiv_i8_by3:
+; RV64:       # %bb.0:
+; RV64-NEXT:    zext.b a0, a0
+; RV64-NEXT:    li a1, 171
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    srli a0, a0, 9
+; RV64-NEXT:    ret
+  %d = udiv i8 %x, 3
+  ret i8 %d
+}
+
+; Even divisor - stock uses pre-shift + magic; simple magic avoids pre-shift.
+define i8 @udiv_i8_by78(i8 %x) nounwind {
+; X86-LABEL: udiv_i8_by78:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrb %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    imull $211, %eax, %eax
+; X86-NEXT:    shrl $13, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: udiv_i8_by78:
+; X64:       # %bb.0:
+; X64-NEXT:    shrb %dil
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    imull $211, %eax, %eax
+; X64-NEXT:    shrl $13, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; AARCH64-LABEL: udiv_i8_by78:
+; AARCH64:       // %bb.0:
+; AARCH64-NEXT:    ubfx w8, w0, #1, #7
+; AARCH64-NEXT:    mov w9, #211 // =0xd3
+; AARCH64-NEXT:    mul w8, w8, w9
+; AARCH64-NEXT:    lsr w0, w8, #13
+; AARCH64-NEXT:    ret
+;
+; RV32-LABEL: udiv_i8_by78:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a0, a0, 24
+; RV32-NEXT:    srli a0, a0, 25
+; RV32-NEXT:    li a1, 211
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    srli a0, a0, 13
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: udiv_i8_by78:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a0, a0, 56
+; RV64-NEXT:    srli a0, a0, 57
+; RV64-NEXT:    li a1, 211
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    srli a0, a0, 13
+; RV64-NEXT:    ret
+  %d = udiv i8 %x, 78
+  ret i8 %d
+}
+
+define i8 @udiv_i8_by116(i8 %x) nounwind {
+; X86-LABEL: udiv_i8_by116:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrb $2, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    imull $71, %eax, %eax
+; X86-NEXT:    shrl $11, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: udiv_i8_by116:
+; X64:       # %bb.0:
+; X64-NEXT:    shrb $2, %dil
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    imull $71, %eax, %eax
+; X64-NEXT:    shrl $11, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; AARCH64-LABEL: udiv_i8_by116:
+; AARCH64:       // %bb.0:
+; AARCH64-NEXT:    ubfx w8, w0, #2, #6
+; AARCH64-NEXT:    mov w9, #71 // =0x47
+; AARCH64-NEXT:    mul w8, w8, w9
+; AARCH64-NEXT:    lsr w0, w8, #11
+; AARCH64-NEXT:    ret
+;
+; RV32-LABEL: udiv_i8_by116:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a0, a0, 24
+; RV32-NEXT:    srli a0, a0, 26
+; RV32-NEXT:    li a1, 71
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    srli a0, a0, 11
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: udiv_i8_by116:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a0, a0, 56
+; RV64-NEXT:    srli a0, a0, 58
+; RV64-NEXT:    li a1, 71
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    srli a0, a0, 11
+; RV64-NEXT:    ret
+  %d = udiv i8 %x, 116
+  ret i8 %d
+}
+
+; Power of 2 - should NOT fire (already lowered to shift).
+define i8 @udiv_i8_by4(i8 %x) nounwind {
+; X86-LABEL: udiv_i8_by4:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrb $2, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: udiv_i8_by4:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shrb $2, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; AARCH64-LABEL: udiv_i8_by4:
+; AARCH64:       // %bb.0:
+; AARCH64-NEXT:    ubfx w0, w0, #2, #6
+; AARCH64-NEXT:    ret
+;
+; RV32-LABEL: udiv_i8_by4:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a0, a0, 24
+; RV32-NEXT:    srli a0, a0, 26
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: udiv_i8_by4:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a0, a0, 56
+; RV64-NEXT:    srli a0, a0, 58
+; RV64-NEXT:    ret
+  %d = udiv i8 %x, 4
+  ret i8 %d
+}
+
+; Division by 1 - should NOT fire.
+define i8 @udiv_i8_by1(i8 %x) nounwind {
+; X86-LABEL: udiv_i8_by1:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: udiv_i8_by1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; AARCH64-LABEL: udiv_i8_by1:
+; AARCH64:       // %bb.0:
+; AARCH64-NEXT:    ret
+;
+; RV32-LABEL: udiv_i8_by1:
+; RV32:       # %bb.0:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: udiv_i8_by1:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ret
+  %d = udiv i8 %x, 1
+  ret i8 %d
+}
+
+; Bare i8 udiv feeding another i8 op (no zext).
+define i8 @udiv_i8_then_add(i8 %x, i8 %y) nounwind {
+; X86-LABEL: udiv_i8_then_add:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    leal (%eax,%ecx,4), %ecx
+; X86-NEXT:    subb %ch, %al
+; X86-NEXT:    shrb %al
+; X86-NEXT:    addb %ch, %al
+; X86-NEXT:    shrb $2, %al
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: udiv_i8_then_add:
+; X64:       # %bb.0:
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    imull $293, %eax, %eax # imm = 0x125
+; X64-NEXT:    shrl $11, %eax
+; X64-NEXT:    addb %sil, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; AARCH64-LABEL: udiv_i8_then_add:
+; AARCH64:       // %bb.0:
+; AARCH64-NEXT:    mov w8, #293 // =0x125
+; AARCH64-NEXT:    and w9, w0, #0xff
+; AARCH64-NEXT:    mul w8, w9, w8
+; AARCH64-NEXT:    add w0, w1, w8, lsr #11
+; AARCH64-NEXT:    ret
+;
+; RV32-LABEL: udiv_i8_then_add:
+; RV32:       # %bb.0:
+; RV32-NEXT:    zext.b a2, a0
+; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    srli a2, a2, 8
+; RV32-NEXT:    sub a0, a0, a2
+; RV32-NEXT:    slli a0, a0, 24
+; RV32-NEXT:    srli a0, a0, 25
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    srli a0, a0, 2
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: udiv_i8_then_add:
+; RV64:       # %bb.0:
+; RV64-NEXT:    zext.b a0, a0
+; RV64-NEXT:    li a2, 293
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    srli a0, a0, 11
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    ret
+  %d = udiv i8 %x, 7
+  %r = add i8 %d, %y
+  ret i8 %r
+}
+
+; --- i16 cases ---
+
+; IsAdd=true: i386 uses shiftless mull (UMUL_LOHI), x86-64 uses imulq+shrq $32,
+; AArch64 falls back to umull+lsr (FullMultiply via i64).
+define i16 @udiv_i16_by7(i16 %x) nounwind {
+; X86-LABEL: udiv_i16_by7:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $613572608, %ecx # imm = 0x24926000
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: udiv_i16_by7:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl %di, %eax
+; X64-NEXT:    imulq $613572608, %rax, %rax # imm = 0x24926000
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    # kill: def $ax killed $ax killed $rax
+; X64-NEXT:    retq
+;
+; AARCH64-LABEL: udiv_i16_by7:
+; AARCH64:       // %bb.0:
+; AARCH64-NEXT:    mov w8, #9363 // =0x2493
+; AARCH64-NEXT:    // kill: def $w0 killed $w0 def $x0
+; AARCH64-NEXT:    and x9, x0, #0xffff
+; AARCH64-NEXT:    movk w8, #1, lsl #16
+; AARCH64-NEXT:    umull x8, w9, w8
+; AARCH64-NEXT:    lsr x0, x8, #19
+; AARCH64-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; AARCH64-NEXT:    ret
+;
+; RV32-LABEL: udiv_i16_by7:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a0, a0, 16
+; RV32-NEXT:    srli a0, a0, 16
+; RV32-NEXT:    lui a1, 149798
+; RV32-NEXT:    mulhu a0, a0, a1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: udiv_i16_by7:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 74899
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    slli a0, a0, 48
+; RV64-NEXT:    mulhu a0, a0, a1
+; RV64-NEXT:    srli a0, a0, 19
+; RV64-NEXT:    ret
+  %d = udiv i16 %x, 7
+  ret i16 %d
+}
+
+define i16 @udiv_i16_by100(i16 %x) nounwind {
+; X86-LABEL: udiv_i16_by100:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    imull $5243, %eax, %eax # imm = 0x147B
+; X86-NEXT:    shrl $17, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: udiv_i16_by100:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl %di, %eax
+; X64-NEXT:    shrl $2, %eax
+; X64-NEXT:    imull $5243, %eax, %eax # imm = 0x147B
+; X64-NEXT:    shrl $17, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+;
+; AARCH64-LABEL: udiv_i16_by100:
+; AARCH64:       // %bb.0:
+; AARCH64-NEXT:    ubfx w8, w0, #2, #14
+; AARCH64-NEXT:    mov w9, #5243 // =0x147b
+; AARCH64-NEXT:    mul w8, w8, w9
+; AARCH64-NEXT:    lsr w0, w8, #17
+; AARCH64-NEXT:    ret
+;
+; RV32-LABEL: udiv_i16_by100:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a0, a0, 16
+; RV32-NEXT:    lui a1, 1
+; RV32-NEXT:    srli a0, a0, 18
+; RV32-NEXT:    addi a1, a1, 1147
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    srli a0, a0, 17
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: udiv_i16_by100:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a0, a0, 48
+; RV64-NEXT:    lui a1, 1
+; RV64-NEXT:    srli a0, a0, 50
+; RV64-NEXT:    addi a1, a1, 1147
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    srli a0, a0, 17
+; RV64-NEXT:    ret
+  %d = udiv i16 %x, 100
+  ret i16 %d
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/udiv-const-optimization.ll b/llvm/test/CodeGen/X86/udiv-const-optimization.ll
index a4fa413bab038..5e2518108512a 100644
--- a/llvm/test/CodeGen/X86/udiv-const-optimization.ll
+++ b/llvm/test/CodeGen/X86/udiv-const-optimization.ll
@@ -139,3 +139,64 @@ define i32 @udiv_by_3(i32 %x) nounwind {
   %div = udiv i32 %x, 3
   ret i32 %div
 }
+
+; Test i16 udiv optimization: shiftless 32x32->hi32 via UMUL_LOHI(i32).
+; On i386, this uses mull (4 instructions, down from the 8-instruction IsAdd GM sequence).
+; On x86-64, the UMUL_LOHI(i32) folds into imulq+shrq $32 (3 instructions).
+define i16 @udiv_i16_by_7(i16 %x) nounwind {
+; X64-LABEL: udiv_i16_by_7:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl %di, %eax
+; X64-NEXT:    imulq $613572608, %rax, %rax # imm = 0x24926000
+; X64-NEXT:    shrq $32, %rax
+; X64-NEXT:    # kill: def $ax killed $ax killed $rax
+; X64-NEXT:    retq
+;
+; X64-BMI2-LABEL: udiv_i16_by_7:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movzwl %di, %eax
+; X64-BMI2-NEXT:    imulq $613572608, %rax, %rax # imm = 0x24926000
+; X64-BMI2-NEXT:    shrq $32, %rax
+; X64-BMI2-NEXT:    # kill: def $ax killed $ax killed $rax
+; X64-BMI2-NEXT:    retq
+;
+; X86-LABEL: udiv_i16_by_7:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $613572608, %ecx # imm = 0x24926000
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+  %div = udiv i16 %x, 7
+  ret i16 %div
+}
+
+; Test non-optimized i16 case: IsAdd=false divisor uses regular 32-bit MULHU.
+define i16 @udiv_i16_by_3(i16 %x) nounwind {
+; X64-LABEL: udiv_i16_by_3:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl %di, %eax
+; X64-NEXT:    imull $43691, %eax, %eax # imm = 0xAAAB
+; X64-NEXT:    shrl $17, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+;
+; X64-BMI2-LABEL: udiv_i16_by_3:
+; X64-BMI2:       # %bb.0:
+; X64-BMI2-NEXT:    movzwl %di, %eax
+; X64-BMI2-NEXT:    imull $43691, %eax, %eax # imm = 0xAAAB
+; X64-BMI2-NEXT:    shrl $17, %eax
+; X64-BMI2-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-BMI2-NEXT:    retq
+;
+; X86-LABEL: udiv_i16_by_3:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $43691, %eax, %eax # imm = 0xAAAB
+; X86-NEXT:    shrl $17, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+  %div = udiv i16 %x, 3
+  ret i16 %div
+}
diff --git a/llvm/test/CodeGen/X86/udiv-narrow-magic.ll b/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
deleted file mode 100644
index 5d5e893a3c59f..0000000000000
--- a/llvm/test/CodeGen/X86/udiv-narrow-magic.ll
+++ /dev/null
@@ -1,191 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=x86_64 < %s | FileCheck %s
-
-; When a narrow udiv-by-constant (i8, i16) is lowered via BuildUDIV, the
-; Hacker's Delight algorithm may need an expensive add-and-shift fixup.
-; A wider legal type (e.g. i32 for i8) has enough headroom for a simple
-; Magic = ceil(2^Shift / C) that needs no fixup at all.
-
-; --- i8 cases ---
-
-define i8 @udiv_i8_by7(i8 %x) nounwind {
-; CHECK-LABEL: udiv_i8_by7:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    imull $293, %eax, %eax # imm = 0x125
-; CHECK-NEXT:    shrl $11, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
-  %d = udiv i8 %x, 7
-  ret i8 %d
-}
-
-define i8 @udiv_i8_by5(i8 %x) nounwind {
-; CHECK-LABEL: udiv_i8_by5:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    imull $205, %eax, %eax
-; CHECK-NEXT:    shrl $10, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
-  %d = udiv i8 %x, 5
-  ret i8 %d
-}
-
-define i8 @udiv_i8_by3(i8 %x) nounwind {
-; CHECK-LABEL: udiv_i8_by3:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    imull $171, %eax, %eax
-; CHECK-NEXT:    shrl $9, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
-  %d = udiv i8 %x, 3
-  ret i8 %d
-}
-
-; Even divisor - stock uses pre-shift + magic; simple magic avoids pre-shift.
-define i8 @udiv_i8_by78(i8 %x) nounwind {
-; CHECK-LABEL: udiv_i8_by78:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    shrb %dil
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    imull $211, %eax, %eax
-; CHECK-NEXT:    shrl $13, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
-  %d = udiv i8 %x, 78
-  ret i8 %d
-}
-
-define i8 @udiv_i8_by116(i8 %x) nounwind {
-; CHECK-LABEL: udiv_i8_by116:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    shrb $2, %dil
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    imull $71, %eax, %eax
-; CHECK-NEXT:    shrl $11, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
-  %d = udiv i8 %x, 116
-  ret i8 %d
-}
-
-; Power of 2 - should NOT fire (already lowered to shift).
-define i8 @udiv_i8_by4(i8 %x) nounwind {
-; CHECK-LABEL: udiv_i8_by4:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shrb $2, %al
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
-  %d = udiv i8 %x, 4
-  ret i8 %d
-}
-
-; Division by 1 - should NOT fire.
-define i8 @udiv_i8_by1(i8 %x) nounwind {
-; CHECK-LABEL: udiv_i8_by1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
-  %d = udiv i8 %x, 1
-  ret i8 %d
-}
-
-; Bare i8 udiv feeding another i8 op (no zext).
-define i8 @udiv_i8_then_add(i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: udiv_i8_then_add:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    imull $293, %eax, %eax # imm = 0x125
-; CHECK-NEXT:    shrl $11, %eax
-; CHECK-NEXT:    addb %sil, %al
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
-  %d = udiv i8 %x, 7
-  %r = add i8 %d, %y
-  ret i8 %r
-}
-
-; --- i16 cases ---
-
-define i16 @udiv_i16_by7(i16 %x) nounwind {
-; CHECK-LABEL: udiv_i16_by7:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movzwl %di, %eax
-; CHECK-NEXT:    imulq $74899, %rax, %rax # imm = 0x12493
-; CHECK-NEXT:    shrq $19, %rax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $rax
-; CHECK-NEXT:    retq
-  %d = udiv i16 %x, 7
-  ret i16 %d
-}
-
-define i16 @udiv_i16_by100(i16 %x) nounwind {
-; CHECK-LABEL: udiv_i16_by100:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movzwl %di, %eax
-; CHECK-NEXT:    shrl $2, %eax
-; CHECK-NEXT:    imull $5243, %eax, %eax # imm = 0x147B
-; CHECK-NEXT:    shrl $17, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    retq
-  %d = udiv i16 %x, 100
-  ret i16 %d
-}
-
-; Vector narrow udiv - should NOT use the scalar narrow-magic widening path.
-define <16 x i8> @udiv_v16i8_by7(<16 x i8> %x) nounwind {
-; CHECK-LABEL: udiv_v16i8_by7:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pxor %xmm1, %xmm1
-; CHECK-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; CHECK-NEXT:    movdqa {{.*#+}} xmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
-; CHECK-NEXT:    pmullw %xmm3, %xmm2
-; CHECK-NEXT:    psrlw $8, %xmm2
-; CHECK-NEXT:    movdqa %xmm0, %xmm4
-; CHECK-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; CHECK-NEXT:    pmullw %xmm3, %xmm4
-; CHECK-NEXT:    psrlw $8, %xmm4
-; CHECK-NEXT:    packuswb %xmm2, %xmm4
-; CHECK-NEXT:    psubb %xmm4, %xmm0
-; CHECK-NEXT:    psrlw $1, %xmm0
-; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    paddb %xmm4, %xmm0
-; CHECK-NEXT:    psrlw $2, %xmm0
-; CHECK-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    retq
-  %d = udiv <16 x i8> %x, splat (i8 7)
-  ret <16 x i8> %d
-}
-
-define <8 x i16> @udiv_v8i16_by7(<8 x i16> %x) nounwind {
-; CHECK-LABEL: udiv_v8i16_by7:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
-; CHECK-NEXT:    pmulhuw %xmm0, %xmm1
-; CHECK-NEXT:    psubw %xmm1, %xmm0
-; CHECK-NEXT:    psrlw $1, %xmm0
-; CHECK-NEXT:    paddw %xmm1, %xmm0
-; CHECK-NEXT:    psrlw $2, %xmm0
-; CHECK-NEXT:    retq
-  %d = udiv <8 x i16> %x, splat (i16 7)
-  ret <8 x i16> %d
-}
-
-; zext(udiv i16) - should also improve.
-define i32 @zext_udiv_i16_by7(i16 %x) nounwind {
-; CHECK-LABEL: zext_udiv_i16_by7:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movzwl %di, %eax
-; CHECK-NEXT:    imulq $74899, %rax, %rax # imm = 0x12493
-; CHECK-NEXT:    shrq $19, %rax
-; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
-; CHECK-NEXT:    retq
-  %d = udiv i16 %x, 7
-  %z = zext i16 %d to i32
-  ret i32 %z
-}
diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
index f9a3b017a6748..89cef0daffd77 100644
--- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
@@ -87,8 +87,8 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
 ; SSE-NEXT:    subl %ecx, %eax
 ; SSE-NEXT:    pinsrw $2, %eax, %xmm1
 ; SSE-NEXT:    pextrw $3, %xmm0, %eax
-; SSE-NEXT:    imulq $66909, %rax, %rcx # imm = 0x1055D
-; SSE-NEXT:    shrq $26, %rcx
+; SSE-NEXT:    imulq $4282176, %rax, %rcx # imm = 0x415740
+; SSE-NEXT:    shrq $32, %rcx
 ; SSE-NEXT:    imull $1003, %ecx, %ecx # imm = 0x3EB
 ; SSE-NEXT:    subl %ecx, %eax
 ; SSE-NEXT:    pinsrw $3, %eax, %xmm1
@@ -198,8 +198,8 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
 ; AVX-NEXT:    subl %ecx, %eax
 ; AVX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
 ; AVX-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX-NEXT:    imulq $66909, %rax, %rcx # imm = 0x1055D
-; AVX-NEXT:    shrq $26, %rcx
+; AVX-NEXT:    imulq $4282176, %rax, %rcx # imm = 0x415740
+; AVX-NEXT:    shrq $32, %rcx
 ; AVX-NEXT:    imull $1003, %ecx, %ecx # imm = 0x3EB
 ; AVX-NEXT:    subl %ecx, %eax
 ; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
@@ -391,8 +391,8 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; SSE-NEXT:    pxor %xmm1, %xmm1
 ; SSE-NEXT:    pinsrw $1, %eax, %xmm1
 ; SSE-NEXT:    pextrw $2, %xmm0, %eax
-; SSE-NEXT:    imulq $91181, %rax, %rcx # imm = 0x1642D
-; SSE-NEXT:    shrq $21, %rcx
+; SSE-NEXT:    imulq $186738688, %rax, %rcx # imm = 0xB216800
+; SSE-NEXT:    shrq $32, %rcx
 ; SSE-NEXT:    leal (%rcx,%rcx,2), %edx
 ; SSE-NEXT:    shll $3, %edx
 ; SSE-NEXT:    subl %edx, %ecx
@@ -505,8 +505,8 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
 ; AVX-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX-NEXT:    imulq $91181, %rax, %rcx # imm = 0x1642D
-; AVX-NEXT:    shrq $21, %rcx
+; AVX-NEXT:    imulq $186738688, %rax, %rcx # imm = 0xB216800
+; AVX-NEXT:    shrq $32, %rcx
 ; AVX-NEXT:    leal (%rcx,%rcx,2), %edx
 ; AVX-NEXT:    shll $3, %edx
 ; AVX-NEXT:    subl %edx, %ecx

>From b19aeb0f818f7232db18efc98bcad0ab9949fc86 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Wed, 25 Mar 2026 20:24:41 -0400
Subject: [PATCH 5/6] code review feedback, remove tests from generic

---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |   6 +-
 .../CodeGen/Generic/udiv-narrow-widening.ll   | 487 ------------------
 llvm/test/CodeGen/RISCV/div-by-constant.ll    |  26 +-
 .../CodeGen/X86/udiv-const-optimization.ll    |   6 +-
 4 files changed, 18 insertions(+), 507 deletions(-)
 delete mode 100644 llvm/test/CodeGen/Generic/udiv-narrow-widening.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index f2e6ca9f1c154..2f9bce103ff36 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6852,13 +6852,9 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
-  // Compute known bits once; used both to reduce the Hacker's Delight magic
-  // and to check simple-wide-magic conditions below.
-  KnownBits Known0 = DAG.computeKnownBits(N0);
-
   // Try to use leading zeros of the dividend to reduce the multiplier and
   // avoid expensive fixups.
-  unsigned KnownLeadingZeros = Known0.countMinLeadingZeros();
+  unsigned KnownLeadingZeros = DAG.computeKnownBits(N0).countMinLeadingZeros();
 
   // If we're after type legalization and SVT is not legal, use the
   // promoted type for creating constants to avoid creating nodes with
diff --git a/llvm/test/CodeGen/Generic/udiv-narrow-widening.ll b/llvm/test/CodeGen/Generic/udiv-narrow-widening.ll
deleted file mode 100644
index 124bb7cb73565..0000000000000
--- a/llvm/test/CodeGen/Generic/udiv-narrow-widening.ll
+++ /dev/null
@@ -1,487 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=i386-unknown-linux-gnu              < %s | FileCheck %s --check-prefixes=CHECK,X86
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu            < %s | FileCheck %s --check-prefixes=CHECK,X64
-; RUN: llc -mtriple=aarch64-unknown-linux-gnu           < %s | FileCheck %s --check-prefixes=CHECK,AARCH64
-; RUN: llc -mtriple=riscv32-unknown-linux-gnu -mattr=+m < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64-unknown-linux-gnu -mattr=+m < %s | FileCheck %s --check-prefixes=CHECK,RV64
-
-; When a narrow udiv-by-constant (i8, i16) is lowered via BuildUDIV, the
-; Granlund-Montgomery (GM) algorithm may need an expensive add-and-shift fixup
-; for IsAdd=true divisors. A wider legal type (e.g. i32 for i16) has enough
-; headroom for a simple magic = ceil(2^Shift / C) that needs no fixup at all.
-;
-; i16 IsAdd=true divisors (e.g. 7): on targets with i32 UMUL_LOHI (i386,
-; x86-64) we emit a shiftless 32x32->64 multiply and take the high 32 bits.
-; On AArch64 (no i32 UMUL_LOHI) we fall back to FullMultiply via i64.
-
-; --- i8 cases ---
-
-define i8 @udiv_i8_by7(i8 %x) nounwind {
-; X86-LABEL: udiv_i8_by7:
-; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    leal (%eax,%eax,8), %ecx
-; X86-NEXT:    leal (%eax,%ecx,4), %ecx
-; X86-NEXT:    subb %ch, %al
-; X86-NEXT:    shrb %al
-; X86-NEXT:    addb %ch, %al
-; X86-NEXT:    shrb $2, %al
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-;
-; X64-LABEL: udiv_i8_by7:
-; X64:       # %bb.0:
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    imull $293, %eax, %eax # imm = 0x125
-; X64-NEXT:    shrl $11, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-;
-; AARCH64-LABEL: udiv_i8_by7:
-; AARCH64:       // %bb.0:
-; AARCH64-NEXT:    // kill: def $w0 killed $w0 def $x0
-; AARCH64-NEXT:    mov w8, #293 // =0x125
-; AARCH64-NEXT:    and x9, x0, #0xff
-; AARCH64-NEXT:    umull x8, w9, w8
-; AARCH64-NEXT:    lsr x0, x8, #11
-; AARCH64-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; AARCH64-NEXT:    ret
-;
-; RV32-LABEL: udiv_i8_by7:
-; RV32:       # %bb.0:
-; RV32-NEXT:    zext.b a1, a0
-; RV32-NEXT:    li a2, 37
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    srli a1, a1, 8
-; RV32-NEXT:    sub a0, a0, a1
-; RV32-NEXT:    slli a0, a0, 24
-; RV32-NEXT:    srli a0, a0, 25
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    srli a0, a0, 2
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: udiv_i8_by7:
-; RV64:       # %bb.0:
-; RV64-NEXT:    zext.b a0, a0
-; RV64-NEXT:    li a1, 293
-; RV64-NEXT:    mul a0, a0, a1
-; RV64-NEXT:    srli a0, a0, 11
-; RV64-NEXT:    ret
-  %d = udiv i8 %x, 7
-  ret i8 %d
-}
-
-define i8 @udiv_i8_by5(i8 %x) nounwind {
-; X86-LABEL: udiv_i8_by5:
-; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull $205, %eax, %eax
-; X86-NEXT:    shrl $10, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-;
-; X64-LABEL: udiv_i8_by5:
-; X64:       # %bb.0:
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    imull $205, %eax, %eax
-; X64-NEXT:    shrl $10, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-;
-; AARCH64-LABEL: udiv_i8_by5:
-; AARCH64:       // %bb.0:
-; AARCH64-NEXT:    mov w8, #205 // =0xcd
-; AARCH64-NEXT:    and w9, w0, #0xff
-; AARCH64-NEXT:    mul w8, w9, w8
-; AARCH64-NEXT:    lsr w0, w8, #10
-; AARCH64-NEXT:    ret
-;
-; RV32-LABEL: udiv_i8_by5:
-; RV32:       # %bb.0:
-; RV32-NEXT:    zext.b a0, a0
-; RV32-NEXT:    li a1, 205
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    srli a0, a0, 10
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: udiv_i8_by5:
-; RV64:       # %bb.0:
-; RV64-NEXT:    zext.b a0, a0
-; RV64-NEXT:    li a1, 205
-; RV64-NEXT:    mul a0, a0, a1
-; RV64-NEXT:    srli a0, a0, 10
-; RV64-NEXT:    ret
-  %d = udiv i8 %x, 5
-  ret i8 %d
-}
-
-define i8 @udiv_i8_by3(i8 %x) nounwind {
-; X86-LABEL: udiv_i8_by3:
-; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull $171, %eax, %eax
-; X86-NEXT:    shrl $9, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-;
-; X64-LABEL: udiv_i8_by3:
-; X64:       # %bb.0:
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    imull $171, %eax, %eax
-; X64-NEXT:    shrl $9, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-;
-; AARCH64-LABEL: udiv_i8_by3:
-; AARCH64:       // %bb.0:
-; AARCH64-NEXT:    mov w8, #171 // =0xab
-; AARCH64-NEXT:    and w9, w0, #0xff
-; AARCH64-NEXT:    mul w8, w9, w8
-; AARCH64-NEXT:    lsr w0, w8, #9
-; AARCH64-NEXT:    ret
-;
-; RV32-LABEL: udiv_i8_by3:
-; RV32:       # %bb.0:
-; RV32-NEXT:    zext.b a0, a0
-; RV32-NEXT:    li a1, 171
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    srli a0, a0, 9
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: udiv_i8_by3:
-; RV64:       # %bb.0:
-; RV64-NEXT:    zext.b a0, a0
-; RV64-NEXT:    li a1, 171
-; RV64-NEXT:    mul a0, a0, a1
-; RV64-NEXT:    srli a0, a0, 9
-; RV64-NEXT:    ret
-  %d = udiv i8 %x, 3
-  ret i8 %d
-}
-
-; Even divisor - stock uses pre-shift + magic; simple magic avoids pre-shift.
-define i8 @udiv_i8_by78(i8 %x) nounwind {
-; X86-LABEL: udiv_i8_by78:
-; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shrb %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    imull $211, %eax, %eax
-; X86-NEXT:    shrl $13, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-;
-; X64-LABEL: udiv_i8_by78:
-; X64:       # %bb.0:
-; X64-NEXT:    shrb %dil
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    imull $211, %eax, %eax
-; X64-NEXT:    shrl $13, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-;
-; AARCH64-LABEL: udiv_i8_by78:
-; AARCH64:       // %bb.0:
-; AARCH64-NEXT:    ubfx w8, w0, #1, #7
-; AARCH64-NEXT:    mov w9, #211 // =0xd3
-; AARCH64-NEXT:    mul w8, w8, w9
-; AARCH64-NEXT:    lsr w0, w8, #13
-; AARCH64-NEXT:    ret
-;
-; RV32-LABEL: udiv_i8_by78:
-; RV32:       # %bb.0:
-; RV32-NEXT:    slli a0, a0, 24
-; RV32-NEXT:    srli a0, a0, 25
-; RV32-NEXT:    li a1, 211
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    srli a0, a0, 13
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: udiv_i8_by78:
-; RV64:       # %bb.0:
-; RV64-NEXT:    slli a0, a0, 56
-; RV64-NEXT:    srli a0, a0, 57
-; RV64-NEXT:    li a1, 211
-; RV64-NEXT:    mul a0, a0, a1
-; RV64-NEXT:    srli a0, a0, 13
-; RV64-NEXT:    ret
-  %d = udiv i8 %x, 78
-  ret i8 %d
-}
-
-define i8 @udiv_i8_by116(i8 %x) nounwind {
-; X86-LABEL: udiv_i8_by116:
-; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shrb $2, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    imull $71, %eax, %eax
-; X86-NEXT:    shrl $11, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-;
-; X64-LABEL: udiv_i8_by116:
-; X64:       # %bb.0:
-; X64-NEXT:    shrb $2, %dil
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    imull $71, %eax, %eax
-; X64-NEXT:    shrl $11, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-;
-; AARCH64-LABEL: udiv_i8_by116:
-; AARCH64:       // %bb.0:
-; AARCH64-NEXT:    ubfx w8, w0, #2, #6
-; AARCH64-NEXT:    mov w9, #71 // =0x47
-; AARCH64-NEXT:    mul w8, w8, w9
-; AARCH64-NEXT:    lsr w0, w8, #11
-; AARCH64-NEXT:    ret
-;
-; RV32-LABEL: udiv_i8_by116:
-; RV32:       # %bb.0:
-; RV32-NEXT:    slli a0, a0, 24
-; RV32-NEXT:    srli a0, a0, 26
-; RV32-NEXT:    li a1, 71
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    srli a0, a0, 11
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: udiv_i8_by116:
-; RV64:       # %bb.0:
-; RV64-NEXT:    slli a0, a0, 56
-; RV64-NEXT:    srli a0, a0, 58
-; RV64-NEXT:    li a1, 71
-; RV64-NEXT:    mul a0, a0, a1
-; RV64-NEXT:    srli a0, a0, 11
-; RV64-NEXT:    ret
-  %d = udiv i8 %x, 116
-  ret i8 %d
-}
-
-; Power of 2 - should NOT fire (already lowered to shift).
-define i8 @udiv_i8_by4(i8 %x) nounwind {
-; X86-LABEL: udiv_i8_by4:
-; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shrb $2, %al
-; X86-NEXT:    retl
-;
-; X64-LABEL: udiv_i8_by4:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrb $2, %al
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-;
-; AARCH64-LABEL: udiv_i8_by4:
-; AARCH64:       // %bb.0:
-; AARCH64-NEXT:    ubfx w0, w0, #2, #6
-; AARCH64-NEXT:    ret
-;
-; RV32-LABEL: udiv_i8_by4:
-; RV32:       # %bb.0:
-; RV32-NEXT:    slli a0, a0, 24
-; RV32-NEXT:    srli a0, a0, 26
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: udiv_i8_by4:
-; RV64:       # %bb.0:
-; RV64-NEXT:    slli a0, a0, 56
-; RV64-NEXT:    srli a0, a0, 58
-; RV64-NEXT:    ret
-  %d = udiv i8 %x, 4
-  ret i8 %d
-}
-
-; Division by 1 - should NOT fire.
-define i8 @udiv_i8_by1(i8 %x) nounwind {
-; X86-LABEL: udiv_i8_by1:
-; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    retl
-;
-; X64-LABEL: udiv_i8_by1:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-;
-; AARCH64-LABEL: udiv_i8_by1:
-; AARCH64:       // %bb.0:
-; AARCH64-NEXT:    ret
-;
-; RV32-LABEL: udiv_i8_by1:
-; RV32:       # %bb.0:
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: udiv_i8_by1:
-; RV64:       # %bb.0:
-; RV64-NEXT:    ret
-  %d = udiv i8 %x, 1
-  ret i8 %d
-}
-
-; Bare i8 udiv feeding another i8 op (no zext).
-define i8 @udiv_i8_then_add(i8 %x, i8 %y) nounwind {
-; X86-LABEL: udiv_i8_then_add:
-; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    leal (%eax,%eax,8), %ecx
-; X86-NEXT:    leal (%eax,%ecx,4), %ecx
-; X86-NEXT:    subb %ch, %al
-; X86-NEXT:    shrb %al
-; X86-NEXT:    addb %ch, %al
-; X86-NEXT:    shrb $2, %al
-; X86-NEXT:    addb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-;
-; X64-LABEL: udiv_i8_then_add:
-; X64:       # %bb.0:
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    imull $293, %eax, %eax # imm = 0x125
-; X64-NEXT:    shrl $11, %eax
-; X64-NEXT:    addb %sil, %al
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-;
-; AARCH64-LABEL: udiv_i8_then_add:
-; AARCH64:       // %bb.0:
-; AARCH64-NEXT:    mov w8, #293 // =0x125
-; AARCH64-NEXT:    and w9, w0, #0xff
-; AARCH64-NEXT:    mul w8, w9, w8
-; AARCH64-NEXT:    add w0, w1, w8, lsr #11
-; AARCH64-NEXT:    ret
-;
-; RV32-LABEL: udiv_i8_then_add:
-; RV32:       # %bb.0:
-; RV32-NEXT:    zext.b a2, a0
-; RV32-NEXT:    li a3, 37
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    srli a2, a2, 8
-; RV32-NEXT:    sub a0, a0, a2
-; RV32-NEXT:    slli a0, a0, 24
-; RV32-NEXT:    srli a0, a0, 25
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    srli a0, a0, 2
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: udiv_i8_then_add:
-; RV64:       # %bb.0:
-; RV64-NEXT:    zext.b a0, a0
-; RV64-NEXT:    li a2, 293
-; RV64-NEXT:    mul a0, a0, a2
-; RV64-NEXT:    srli a0, a0, 11
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    ret
-  %d = udiv i8 %x, 7
-  %r = add i8 %d, %y
-  ret i8 %r
-}
-
-; --- i16 cases ---
-
-; IsAdd=true: i386 uses shiftless mull (UMUL_LOHI), x86-64 uses imulq+shrq $32,
-; AArch64 falls back to umull+lsr (FullMultiply via i64).
-define i16 @udiv_i16_by7(i16 %x) nounwind {
-; X86-LABEL: udiv_i16_by7:
-; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $613572608, %ecx # imm = 0x24926000
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    retl
-;
-; X64-LABEL: udiv_i16_by7:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl %di, %eax
-; X64-NEXT:    imulq $613572608, %rax, %rax # imm = 0x24926000
-; X64-NEXT:    shrq $32, %rax
-; X64-NEXT:    # kill: def $ax killed $ax killed $rax
-; X64-NEXT:    retq
-;
-; AARCH64-LABEL: udiv_i16_by7:
-; AARCH64:       // %bb.0:
-; AARCH64-NEXT:    mov w8, #9363 // =0x2493
-; AARCH64-NEXT:    // kill: def $w0 killed $w0 def $x0
-; AARCH64-NEXT:    and x9, x0, #0xffff
-; AARCH64-NEXT:    movk w8, #1, lsl #16
-; AARCH64-NEXT:    umull x8, w9, w8
-; AARCH64-NEXT:    lsr x0, x8, #19
-; AARCH64-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; AARCH64-NEXT:    ret
-;
-; RV32-LABEL: udiv_i16_by7:
-; RV32:       # %bb.0:
-; RV32-NEXT:    slli a0, a0, 16
-; RV32-NEXT:    srli a0, a0, 16
-; RV32-NEXT:    lui a1, 149798
-; RV32-NEXT:    mulhu a0, a0, a1
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: udiv_i16_by7:
-; RV64:       # %bb.0:
-; RV64-NEXT:    lui a1, 74899
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    slli a0, a0, 48
-; RV64-NEXT:    mulhu a0, a0, a1
-; RV64-NEXT:    srli a0, a0, 19
-; RV64-NEXT:    ret
-  %d = udiv i16 %x, 7
-  ret i16 %d
-}
-
-define i16 @udiv_i16_by100(i16 %x) nounwind {
-; X86-LABEL: udiv_i16_by100:
-; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    imull $5243, %eax, %eax # imm = 0x147B
-; X86-NEXT:    shrl $17, %eax
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    retl
-;
-; X64-LABEL: udiv_i16_by100:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl %di, %eax
-; X64-NEXT:    shrl $2, %eax
-; X64-NEXT:    imull $5243, %eax, %eax # imm = 0x147B
-; X64-NEXT:    shrl $17, %eax
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    retq
-;
-; AARCH64-LABEL: udiv_i16_by100:
-; AARCH64:       // %bb.0:
-; AARCH64-NEXT:    ubfx w8, w0, #2, #14
-; AARCH64-NEXT:    mov w9, #5243 // =0x147b
-; AARCH64-NEXT:    mul w8, w8, w9
-; AARCH64-NEXT:    lsr w0, w8, #17
-; AARCH64-NEXT:    ret
-;
-; RV32-LABEL: udiv_i16_by100:
-; RV32:       # %bb.0:
-; RV32-NEXT:    slli a0, a0, 16
-; RV32-NEXT:    lui a1, 1
-; RV32-NEXT:    srli a0, a0, 18
-; RV32-NEXT:    addi a1, a1, 1147
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    srli a0, a0, 17
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: udiv_i16_by100:
-; RV64:       # %bb.0:
-; RV64-NEXT:    slli a0, a0, 48
-; RV64-NEXT:    lui a1, 1
-; RV64-NEXT:    srli a0, a0, 50
-; RV64-NEXT:    addi a1, a1, 1147
-; RV64-NEXT:    mul a0, a0, a1
-; RV64-NEXT:    srli a0, a0, 17
-; RV64-NEXT:    ret
-  %d = udiv i16 %x, 100
-  ret i16 %d
-}
-
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index 40b599bc1a076..44773ff04924b 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -278,18 +278,20 @@ define i16 @udiv16_constant_no_add(i16 %a) nounwind {
 }
 
 define i16 @udiv16_constant_add(i16 %a) nounwind {
-; RV32-LABEL: udiv16_constant_add:
-; RV32:       # %bb.0:
-; RV32-NEXT:    slli a1, a0, 16
-; RV32-NEXT:    lui a2, 149808
-; RV32-NEXT:    mulhu a1, a1, a2
-; RV32-NEXT:    srli a1, a1, 16
-; RV32-NEXT:    sub a0, a0, a1
-; RV32-NEXT:    slli a0, a0, 16
-; RV32-NEXT:    srli a0, a0, 17
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    srli a0, a0, 2
-; RV32-NEXT:    ret
+; RV32IM-LABEL: udiv16_constant_add:
+; RV32IM:       # %bb.0:
+; RV32IM-NEXT:    slli a0, a0, 16
+; RV32IM-NEXT:    srli a0, a0, 16
+; RV32IM-NEXT:    lui a1, 149798
+; RV32IM-NEXT:    mulhu a0, a0, a1
+; RV32IM-NEXT:    ret
+;
+; RV32IMZB-LABEL: udiv16_constant_add:
+; RV32IMZB:       # %bb.0:
+; RV32IMZB-NEXT:    zext.h a0, a0
+; RV32IMZB-NEXT:    lui a1, 149798
+; RV32IMZB-NEXT:    mulhu a0, a0, a1
+; RV32IMZB-NEXT:    ret
 ;
 ; RV64-LABEL: udiv16_constant_add:
 ; RV64:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/udiv-const-optimization.ll b/llvm/test/CodeGen/X86/udiv-const-optimization.ll
index 5e2518108512a..b4152aaddf39d 100644
--- a/llvm/test/CodeGen/X86/udiv-const-optimization.ll
+++ b/llvm/test/CodeGen/X86/udiv-const-optimization.ll
@@ -62,7 +62,7 @@ define i32 @udiv_by_19(i32 %x) nounwind {
 ; X86-LABEL: udiv_by_19:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl ${{-?[0-9]+}}, %edx # imm = 0xAF286BCB
+; X86-NEXT:    movl $-1356305461, %edx # imm = 0xAF286BCB
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    subl %edx, %ecx
@@ -96,7 +96,7 @@ define i32 @udiv_by_21(i32 %x) nounwind {
 ; X86-LABEL: udiv_by_21:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl ${{-?[0-9]+}}, %edx # imm = 0x86186187
+; X86-NEXT:    movl $-2045222521, %edx # imm = 0x86186187
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    subl %edx, %ecx
@@ -131,7 +131,7 @@ define i32 @udiv_by_3(i32 %x) nounwind {
 ;
 ; X86-LABEL: udiv_by_3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl ${{-?[0-9]+}}, %eax # imm = 0xAAAAAAAB
+; X86-NEXT:    movl $-1431655765, %eax # imm = 0xAAAAAAAB
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    shrl %eax

>From 8a953862ab10de99f4abf31a7b245d3be7faf463 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sat, 4 Apr 2026 22:19:06 -0400
Subject: [PATCH 6/6] fix botched rebase

---
 llvm/test/CodeGen/X86/urem-vector-lkk.ll | 298 +----------------------
 1 file changed, 2 insertions(+), 296 deletions(-)

diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
index 89cef0daffd77..3faa2a0720d4e 100644
--- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
@@ -22,78 +22,8 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
 ; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,124,98,1003,u,u,u,u]
 ; SSE2-NEXT:    psubw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
-; SSE-LABEL: fold_urem_vec_1:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pextrw $1, %xmm0, %eax
-; SSE-NEXT:    movl %eax, %ecx
-; SSE-NEXT:    shrl $2, %ecx
-; SSE-NEXT:    imull $16913, %ecx, %ecx # imm = 0x4211
-; SSE-NEXT:    shrl $19, %ecx
-; SSE-NEXT:    imull $124, %ecx, %ecx
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    movd %xmm0, %ecx
-; SSE-NEXT:    movzwl %cx, %edx
-; SSE-NEXT:    imull $44151, %edx, %edx # imm = 0xAC77
-; SSE-NEXT:    shrl $22, %edx
-; SSE-NEXT:    imull $95, %edx, %edx
-; SSE-NEXT:    subl %edx, %ecx
-; SSE-NEXT:    movd %ecx, %xmm1
-; SSE-NEXT:    pinsrw $1, %eax, %xmm1
-; SSE-NEXT:    pextrw $2, %xmm0, %eax
-; SSE-NEXT:    movl %eax, %ecx
-; SSE-NEXT:    shrl %ecx
-; SSE-NEXT:    imull $2675, %ecx, %ecx # imm = 0xA73
-; SSE-NEXT:    shrl $17, %ecx
-; SSE-NEXT:    imull $98, %ecx, %ecx
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    pinsrw $2, %eax, %xmm1
-; SSE-NEXT:    pextrw $3, %xmm0, %eax
-; SSE-NEXT:    imull $1373, %eax, %ecx # imm = 0x55D
-; SSE-NEXT:    shrl $16, %ecx
-; SSE-NEXT:    movl %eax, %edx
-; SSE-NEXT:    subl %ecx, %edx
-; SSE-NEXT:    movzwl %dx, %edx
-; SSE-NEXT:    shrl %edx
-; SSE-NEXT:    addl %ecx, %edx
-; SSE-NEXT:    shrl $9, %edx
-; SSE-NEXT:    imull $1003, %edx, %ecx # imm = 0x3EB
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    pinsrw $3, %eax, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    retq
-; SSE-LABEL: fold_urem_vec_1:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pextrw $1, %xmm0, %eax
-; SSE-NEXT:    movl %eax, %ecx
-; SSE-NEXT:    shrl $2, %ecx
-; SSE-NEXT:    imull $16913, %ecx, %ecx # imm = 0x4211
-; SSE-NEXT:    shrl $19, %ecx
-; SSE-NEXT:    imull $124, %ecx, %ecx
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    movd %xmm0, %ecx
-; SSE-NEXT:    movzwl %cx, %edx
-; SSE-NEXT:    imull $44151, %edx, %edx # imm = 0xAC77
-; SSE-NEXT:    shrl $22, %edx
-; SSE-NEXT:    imull $95, %edx, %edx
-; SSE-NEXT:    subl %edx, %ecx
-; SSE-NEXT:    movd %ecx, %xmm1
-; SSE-NEXT:    pinsrw $1, %eax, %xmm1
-; SSE-NEXT:    pextrw $2, %xmm0, %eax
-; SSE-NEXT:    movl %eax, %ecx
-; SSE-NEXT:    shrl %ecx
-; SSE-NEXT:    imull $2675, %ecx, %ecx # imm = 0xA73
-; SSE-NEXT:    shrl $17, %ecx
-; SSE-NEXT:    imull $98, %ecx, %ecx
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    pinsrw $2, %eax, %xmm1
-; SSE-NEXT:    pextrw $3, %xmm0, %eax
-; SSE-NEXT:    imulq $4282176, %rax, %rcx # imm = 0x415740
-; SSE-NEXT:    shrq $32, %rcx
-; SSE-NEXT:    imull $1003, %ecx, %ecx # imm = 0x3EB
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    pinsrw $3, %eax, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    retq
+;
+; SSE4-LABEL: fold_urem_vec_1:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    movq {{.*#+}} xmm1 = [0,16384,32768,0,0,0,0,0]
 ; SSE4-NEXT:    pmulhuw %xmm0, %xmm1
@@ -132,79 +62,6 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
 ; AVX512-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,124,98,1003,u,u,u,u]
 ; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-||||||| parent of 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
-; AVX-LABEL: fold_urem_vec_1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrw $1, %xmm0, %eax
-; AVX-NEXT:    movl %eax, %ecx
-; AVX-NEXT:    shrl $2, %ecx
-; AVX-NEXT:    imull $16913, %ecx, %ecx # imm = 0x4211
-; AVX-NEXT:    shrl $19, %ecx
-; AVX-NEXT:    imull $124, %ecx, %ecx
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    vmovd %xmm0, %ecx
-; AVX-NEXT:    movzwl %cx, %edx
-; AVX-NEXT:    imull $44151, %edx, %edx # imm = 0xAC77
-; AVX-NEXT:    shrl $22, %edx
-; AVX-NEXT:    imull $95, %edx, %edx
-; AVX-NEXT:    subl %edx, %ecx
-; AVX-NEXT:    vmovd %ecx, %xmm1
-; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX-NEXT:    movl %eax, %ecx
-; AVX-NEXT:    shrl %ecx
-; AVX-NEXT:    imull $2675, %ecx, %ecx # imm = 0xA73
-; AVX-NEXT:    shrl $17, %ecx
-; AVX-NEXT:    imull $98, %ecx, %ecx
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX-NEXT:    imull $1373, %eax, %ecx # imm = 0x55D
-; AVX-NEXT:    shrl $16, %ecx
-; AVX-NEXT:    movl %eax, %edx
-; AVX-NEXT:    subl %ecx, %edx
-; AVX-NEXT:    movzwl %dx, %edx
-; AVX-NEXT:    shrl %edx
-; AVX-NEXT:    addl %ecx, %edx
-; AVX-NEXT:    shrl $9, %edx
-; AVX-NEXT:    imull $1003, %edx, %ecx # imm = 0x3EB
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
-; AVX-NEXT:    retq
-=======
-; AVX-LABEL: fold_urem_vec_1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrw $1, %xmm0, %eax
-; AVX-NEXT:    movl %eax, %ecx
-; AVX-NEXT:    shrl $2, %ecx
-; AVX-NEXT:    imull $16913, %ecx, %ecx # imm = 0x4211
-; AVX-NEXT:    shrl $19, %ecx
-; AVX-NEXT:    imull $124, %ecx, %ecx
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    vmovd %xmm0, %ecx
-; AVX-NEXT:    movzwl %cx, %edx
-; AVX-NEXT:    imull $44151, %edx, %edx # imm = 0xAC77
-; AVX-NEXT:    shrl $22, %edx
-; AVX-NEXT:    imull $95, %edx, %edx
-; AVX-NEXT:    subl %edx, %ecx
-; AVX-NEXT:    vmovd %ecx, %xmm1
-; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX-NEXT:    movl %eax, %ecx
-; AVX-NEXT:    shrl %ecx
-; AVX-NEXT:    imull $2675, %ecx, %ecx # imm = 0xA73
-; AVX-NEXT:    shrl $17, %ecx
-; AVX-NEXT:    imull $98, %ecx, %ecx
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX-NEXT:    imulq $4282176, %rax, %rcx # imm = 0x415740
-; AVX-NEXT:    shrq $32, %rcx
-; AVX-NEXT:    imull $1003, %ecx, %ecx # imm = 0x3EB
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
-; AVX-NEXT:    retq
->>>>>>> 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
   %1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
   ret <4 x i16> %1
 }
@@ -318,7 +175,6 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
 
 ; Don't fold if the divisor is one.
 define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
-<<<<<<< HEAD
 ; SSE2-LABEL: dont_fold_urem_one:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
@@ -348,67 +204,7 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; SSE4-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,654,23,5423,u,u,u,u]
 ; SSE4-NEXT:    psubw %xmm2, %xmm0
 ; SSE4-NEXT:    retq
-||||||| parent of 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
-; SSE-LABEL: dont_fold_urem_one:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pextrw $2, %xmm0, %eax
-; SSE-NEXT:    imull $25645, %eax, %ecx # imm = 0x642D
-; SSE-NEXT:    shrl $16, %ecx
-; SSE-NEXT:    movl %eax, %edx
-; SSE-NEXT:    subl %ecx, %edx
-; SSE-NEXT:    movzwl %dx, %edx
-; SSE-NEXT:    shrl %edx
-; SSE-NEXT:    addl %ecx, %edx
-; SSE-NEXT:    shrl $4, %edx
-; SSE-NEXT:    leal (%rdx,%rdx,2), %ecx
-; SSE-NEXT:    shll $3, %ecx
-; SSE-NEXT:    subl %ecx, %edx
-; SSE-NEXT:    addl %eax, %edx
-; SSE-NEXT:    pextrw $1, %xmm0, %eax
-; SSE-NEXT:    imull $51307, %eax, %ecx # imm = 0xC86B
-; SSE-NEXT:    shrl $25, %ecx
-; SSE-NEXT:    imull $654, %ecx, %ecx # imm = 0x28E
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    pinsrw $1, %eax, %xmm1
-; SSE-NEXT:    pinsrw $2, %edx, %xmm1
-; SSE-NEXT:    pextrw $3, %xmm0, %eax
-; SSE-NEXT:    imull $12375, %eax, %ecx # imm = 0x3057
-; SSE-NEXT:    shrl $26, %ecx
-; SSE-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    pinsrw $3, %eax, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    retq
-=======
-; SSE-LABEL: dont_fold_urem_one:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pextrw $1, %xmm0, %eax
-; SSE-NEXT:    imull $51307, %eax, %ecx # imm = 0xC86B
-; SSE-NEXT:    shrl $25, %ecx
-; SSE-NEXT:    imull $654, %ecx, %ecx # imm = 0x28E
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    pinsrw $1, %eax, %xmm1
-; SSE-NEXT:    pextrw $2, %xmm0, %eax
-; SSE-NEXT:    imulq $186738688, %rax, %rcx # imm = 0xB216800
-; SSE-NEXT:    shrq $32, %rcx
-; SSE-NEXT:    leal (%rcx,%rcx,2), %edx
-; SSE-NEXT:    shll $3, %edx
-; SSE-NEXT:    subl %edx, %ecx
-; SSE-NEXT:    addl %eax, %ecx
-; SSE-NEXT:    pinsrw $2, %ecx, %xmm1
-; SSE-NEXT:    pextrw $3, %xmm0, %eax
-; SSE-NEXT:    imull $12375, %eax, %ecx # imm = 0x3057
-; SSE-NEXT:    shrl $26, %ecx
-; SSE-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
-; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    pinsrw $3, %eax, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    retq
->>>>>>> 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
 ;
-<<<<<<< HEAD
 ; AVX1OR2-LABEL: dont_fold_urem_one:
 ; AVX1OR2:       # %bb.0:
 ; AVX1OR2-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,51307,25645,12375,u,u,u,u]
@@ -432,94 +228,6 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; AVX512-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,654,23,5423,u,u,u,u]
 ; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-||||||| parent of 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
-; AVX1OR2-LABEL: dont_fold_urem_one:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX1OR2-NEXT:    imull $25645, %eax, %ecx # imm = 0x642D
-; AVX1OR2-NEXT:    shrl $16, %ecx
-; AVX1OR2-NEXT:    movl %eax, %edx
-; AVX1OR2-NEXT:    subl %ecx, %edx
-; AVX1OR2-NEXT:    movzwl %dx, %edx
-; AVX1OR2-NEXT:    shrl %edx
-; AVX1OR2-NEXT:    addl %ecx, %edx
-; AVX1OR2-NEXT:    shrl $4, %edx
-; AVX1OR2-NEXT:    leal (%rdx,%rdx,2), %ecx
-; AVX1OR2-NEXT:    shll $3, %ecx
-; AVX1OR2-NEXT:    subl %ecx, %edx
-; AVX1OR2-NEXT:    addl %eax, %edx
-; AVX1OR2-NEXT:    vpextrw $1, %xmm0, %eax
-; AVX1OR2-NEXT:    imull $51307, %eax, %ecx # imm = 0xC86B
-; AVX1OR2-NEXT:    shrl $25, %ecx
-; AVX1OR2-NEXT:    imull $654, %ecx, %ecx # imm = 0x28E
-; AVX1OR2-NEXT:    subl %ecx, %eax
-; AVX1OR2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX1OR2-NEXT:    vpinsrw $2, %edx, %xmm1, %xmm1
-; AVX1OR2-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX1OR2-NEXT:    imull $12375, %eax, %ecx # imm = 0x3057
-; AVX1OR2-NEXT:    shrl $26, %ecx
-; AVX1OR2-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
-; AVX1OR2-NEXT:    subl %ecx, %eax
-; AVX1OR2-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
-; AVX1OR2-NEXT:    retq
-;
-; AVX512-LABEL: dont_fold_urem_one:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX512-NEXT:    imull $25645, %eax, %ecx # imm = 0x642D
-; AVX512-NEXT:    shrl $16, %ecx
-; AVX512-NEXT:    movl %eax, %edx
-; AVX512-NEXT:    subl %ecx, %edx
-; AVX512-NEXT:    movzwl %dx, %edx
-; AVX512-NEXT:    shrl %edx
-; AVX512-NEXT:    addl %ecx, %edx
-; AVX512-NEXT:    shrl $4, %edx
-; AVX512-NEXT:    leal (%rdx,%rdx,2), %ecx
-; AVX512-NEXT:    shll $3, %ecx
-; AVX512-NEXT:    subl %ecx, %edx
-; AVX512-NEXT:    vpextrw $1, %xmm0, %ecx
-; AVX512-NEXT:    addl %eax, %edx
-; AVX512-NEXT:    imull $51307, %ecx, %eax # imm = 0xC86B
-; AVX512-NEXT:    shrl $25, %eax
-; AVX512-NEXT:    imull $654, %eax, %eax # imm = 0x28E
-; AVX512-NEXT:    subl %eax, %ecx
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
-; AVX512-NEXT:    vpinsrw $2, %edx, %xmm1, %xmm1
-; AVX512-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX512-NEXT:    imull $12375, %eax, %ecx # imm = 0x3057
-; AVX512-NEXT:    shrl $26, %ecx
-; AVX512-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
-; AVX512-NEXT:    subl %ecx, %eax
-; AVX512-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
-; AVX512-NEXT:    retq
-=======
-; AVX-LABEL: dont_fold_urem_one:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrw $1, %xmm0, %eax
-; AVX-NEXT:    imull $51307, %eax, %ecx # imm = 0xC86B
-; AVX-NEXT:    shrl $25, %ecx
-; AVX-NEXT:    imull $654, %ecx, %ecx # imm = 0x28E
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX-NEXT:    imulq $186738688, %rax, %rcx # imm = 0xB216800
-; AVX-NEXT:    shrq $32, %rcx
-; AVX-NEXT:    leal (%rcx,%rcx,2), %edx
-; AVX-NEXT:    shll $3, %edx
-; AVX-NEXT:    subl %edx, %ecx
-; AVX-NEXT:    addl %eax, %ecx
-; AVX-NEXT:    vpinsrw $2, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX-NEXT:    imull $12375, %eax, %ecx # imm = 0x3057
-; AVX-NEXT:    shrl $26, %ecx
-; AVX-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
-; AVX-NEXT:    retq
->>>>>>> 49405037fc36 (Use fixup-free 64-bit magic multiply for narrow udiv with IsAdd)
   %1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
   ret <4 x i16> %1
 }
@@ -737,5 +445,3 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
   %1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
   ret <4 x i64> %1
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX1OR2: {{.*}}