[llvm] [LegalizeTypes] Expand UDIV/UREM by constant via chunk summation (PR #146238)
Shivam Gupta via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 15 23:19:22 PDT 2026
https://github.com/xgupta updated https://github.com/llvm/llvm-project/pull/146238
>From 97af698fd6e99e993492d1bbce3d8923161d7c8d Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Fri, 27 Jun 2025 19:01:03 +0530
Subject: [PATCH 01/15] [LegalizeTypes] Expand 128-bit UDIV/UREM by constant
via Chunk Addition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This patch improves the lowering of 128-bit unsigned division and remainder
by constants (UDIV/UREM) by avoiding the fallback to a libcall
(__udivti3/__umodti3) for specific divisors.
When a divisor D satisfies (1 << ChunkWidth) % D == 1, the 128-bit value is
split into fixed-width chunks (e.g., 30-bit) that are summed before applying
a narrower UDIV/UREM. This transformation is based on the "remainder by
summing digits" trick described in Hacker’s Delight.
This fixes PR137514 for some constants.
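
For illustration only (not part of the patch), here is a minimal standalone
C++ sketch of the trick for a 64-bit value and divisor 7, where
(1 << 30) % 7 == 1; the helper name urem7_by_chunks is made up for this
example:

  #include <cstdint>

  // Since 2^30 == 1 (mod 7), a value is congruent to the sum of its
  // base-2^30 digits, so the wide remainder reduces to a narrow one.
  uint64_t urem7_by_chunks(uint64_t x) {
    const unsigned ChunkWidth = 30;                 // (1 << 30) % 7 == 1
    const uint64_t Mask = (1ULL << ChunkWidth) - 1; // 0x3FFFFFFF
    uint64_t Sum = 0;
    for (unsigned i = 0; i < 64; i += ChunkWidth)
      Sum += (x >> i) & Mask; // three chunks for 64 bits; cannot overflow
    return Sum % 7;           // remainder of a much smaller value
  }

The DAG expansion below follows the same shape, propagating carries between
the chunk additions, and then recovers the quotient from the remainder with
a subtract and a multiply by the divisor's multiplicative inverse, as seen
in the updated tests.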
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 76 ++++++-
llvm/test/CodeGen/RISCV/div-by-constant.ll | 80 ++++++-
.../CodeGen/RISCV/split-udiv-by-constant.ll | 183 ++++++++++++---
.../CodeGen/RISCV/split-urem-by-constant.ll | 135 ++++++++---
llvm/test/CodeGen/X86/divide-by-constant.ll | 42 +++-
llvm/test/CodeGen/X86/divmod128.ll | 81 +++++--
llvm/test/CodeGen/X86/uint128-div-const.ll | 210 ++++++++++++++++++
7 files changed, 717 insertions(+), 90 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/uint128-div-const.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5748ef89aef4e..ec5e7b44a3d09 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8184,8 +8184,6 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
// If (1 << HBitWidth) % divisor == 1, we can add the two halves together and
// then add in the carry.
- // TODO: If we can't split it in half, we might be able to split into 3 or
- // more pieces using a smaller bit width.
if (HalfMaxPlus1.urem(Divisor).isOne()) {
assert(!LL == !LH && "Expected both input halves or no input halves!");
if (!LL)
@@ -8233,6 +8231,80 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
DAG.getConstant(0, dl, HiLoVT));
Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, Sum, Carry);
}
+
+ } else {
+    // If we cannot split into two halves, look for a smaller chunk width
+    // where (1 << ChunkWidth) mod Divisor == 1.
+    // This ensures that the sum of all such chunks is congruent to the
+    // original value modulo Divisor.
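+    // For example, Divisor == 7 admits a 30-bit chunk width, since
+    // (1 << 30) mod 7 == 1.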
+ const APInt &Divisor = CN->getAPIntValue();
+ unsigned BitWidth = VT.getScalarSizeInBits();
+ unsigned BestChunkWidth = 0;
+
+ // We restrict to small chunk sizes (e.g., ≤ 32 bits) to ensure that all
+ // operations remain legal on most targets.
+ unsigned MaxChunk = 32;
+ for (int i = MaxChunk; i >= 1; --i) {
+ APInt ChunkMaxPlus1 = APInt::getOneBitSet(BitWidth, i);
+ if (ChunkMaxPlus1.urem(Divisor).isOne()) {
+ BestChunkWidth = i;
+ break;
+ }
+ }
+
+ // If we found a good chunk width, slice the number and sum the pieces.
+ if (BestChunkWidth > 0) {
+ EVT ChunkVT = EVT::getIntegerVT(*DAG.getContext(), BestChunkWidth);
+
+ if (!LL)
+ std::tie(LL, LH) =
+ DAG.SplitScalar(N->getOperand(0), dl, HiLoVT, HiLoVT);
+ SDValue In = DAG.getNode(ISD::BUILD_PAIR, dl, VT, LL, LH);
+
+ SmallVector<SDValue, 8> Parts;
+ // Split into fixed-size chunks
+ for (unsigned i = 0; i < BitWidth; i += BestChunkWidth) {
+ SDValue Shift = DAG.getShiftAmountConstant(i, VT, dl);
+ SDValue Chunk = DAG.getNode(ISD::SRL, dl, VT, In, Shift);
+ Chunk = DAG.getNode(ISD::TRUNCATE, dl, ChunkVT, Chunk);
+ Parts.push_back(Chunk);
+ }
+ if (Parts.empty())
+ return false;
+ Sum = Parts[0];
+
+      // Use uaddo_carry if we can; otherwise use a compare to detect
+      // overflow, mirroring the half-split logic above.
+ SDValue Carry = DAG.getConstant(0, dl, ChunkVT);
+ EVT SetCCType =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ChunkVT);
+ for (unsigned i = 1; i < Parts.size(); ++i) {
+ if (isOperationLegalOrCustom(ISD::UADDO_CARRY, ChunkVT)) {
+ SDVTList VTList = DAG.getVTList(ChunkVT, SetCCType);
+ SDValue UAdd = DAG.getNode(ISD::UADDO, dl, VTList, Sum, Parts[i]);
+ Sum = DAG.getNode(ISD::UADDO_CARRY, dl, VTList, UAdd, Carry,
+ UAdd.getValue(1));
+ } else {
+ SDValue Add = DAG.getNode(ISD::ADD, dl, ChunkVT, Sum, Parts[i]);
+ SDValue NewCarry = DAG.getSetCC(dl, SetCCType, Add, Sum, ISD::SETULT);
+
+ if (getBooleanContents(ChunkVT) ==
+ TargetLoweringBase::ZeroOrOneBooleanContent)
+ NewCarry = DAG.getZExtOrTrunc(NewCarry, dl, ChunkVT);
+ else
+ NewCarry = DAG.getSelect(dl, ChunkVT, NewCarry,
+ DAG.getConstant(1, dl, ChunkVT),
+ DAG.getConstant(0, dl, ChunkVT));
+
+ Sum = DAG.getNode(ISD::ADD, dl, ChunkVT, Add, Carry);
+ Carry = NewCarry;
+ }
+ }
+
+ Sum = DAG.getNode(ISD::ZERO_EXTEND, dl, HiLoVT, Sum);
+ } else {
+ return false;
+ }
}
// If we didn't find a sum, we can't do the expansion.
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index 24c882daa113d..bf73f37b09d08 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -111,16 +111,76 @@ define i64 @udiv64_constant_no_add(i64 %a) nounwind {
}
define i64 @udiv64_constant_add(i64 %a) nounwind {
-; RV32-LABEL: udiv64_constant_add:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: li a2, 7
-; RV32-NEXT: li a3, 0
-; RV32-NEXT: call __udivdi3
-; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
+; RV32IM-LABEL: udiv64_constant_add:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: lui a2, 262144
+; RV32IM-NEXT: slli a3, a1, 2
+; RV32IM-NEXT: srli a4, a0, 30
+; RV32IM-NEXT: srli a5, a1, 28
+; RV32IM-NEXT: lui a6, 149797
+; RV32IM-NEXT: addi a2, a2, -1
+; RV32IM-NEXT: or a3, a4, a3
+; RV32IM-NEXT: and a4, a0, a2
+; RV32IM-NEXT: add a3, a0, a3
+; RV32IM-NEXT: add a5, a3, a5
+; RV32IM-NEXT: and a3, a3, a2
+; RV32IM-NEXT: sltu a3, a3, a4
+; RV32IM-NEXT: lui a4, 449390
+; RV32IM-NEXT: add a3, a5, a3
+; RV32IM-NEXT: lui a5, 748983
+; RV32IM-NEXT: addi a6, a6, -1755
+; RV32IM-NEXT: addi a4, a4, -1171
+; RV32IM-NEXT: addi a5, a5, -585
+; RV32IM-NEXT: and a2, a3, a2
+; RV32IM-NEXT: mulhu a3, a2, a6
+; RV32IM-NEXT: slli a6, a3, 3
+; RV32IM-NEXT: add a2, a2, a3
+; RV32IM-NEXT: sub a2, a2, a6
+; RV32IM-NEXT: sub a3, a0, a2
+; RV32IM-NEXT: sltu a0, a0, a2
+; RV32IM-NEXT: mul a2, a3, a4
+; RV32IM-NEXT: mulhu a4, a3, a5
+; RV32IM-NEXT: sub a1, a1, a0
+; RV32IM-NEXT: add a2, a4, a2
+; RV32IM-NEXT: mul a1, a1, a5
+; RV32IM-NEXT: add a1, a2, a1
+; RV32IM-NEXT: mul a0, a3, a5
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: udiv64_constant_add:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: srli a2, a0, 30
+; RV32IMZB-NEXT: srli a3, a1, 28
+; RV32IMZB-NEXT: lui a4, 786432
+; RV32IMZB-NEXT: slli a5, a0, 2
+; RV32IMZB-NEXT: lui a6, 149797
+; RV32IMZB-NEXT: sh2add a2, a1, a2
+; RV32IMZB-NEXT: srli a5, a5, 2
+; RV32IMZB-NEXT: add a2, a0, a2
+; RV32IMZB-NEXT: add a3, a2, a3
+; RV32IMZB-NEXT: andn a2, a2, a4
+; RV32IMZB-NEXT: sltu a2, a2, a5
+; RV32IMZB-NEXT: lui a5, 449390
+; RV32IMZB-NEXT: add a2, a3, a2
+; RV32IMZB-NEXT: lui a3, 748983
+; RV32IMZB-NEXT: addi a6, a6, -1755
+; RV32IMZB-NEXT: addi a5, a5, -1171
+; RV32IMZB-NEXT: addi a3, a3, -585
+; RV32IMZB-NEXT: andn a2, a2, a4
+; RV32IMZB-NEXT: mulhu a4, a2, a6
+; RV32IMZB-NEXT: slli a6, a4, 3
+; RV32IMZB-NEXT: add a2, a2, a4
+; RV32IMZB-NEXT: sub a2, a2, a6
+; RV32IMZB-NEXT: sub a4, a0, a2
+; RV32IMZB-NEXT: sltu a0, a0, a2
+; RV32IMZB-NEXT: mul a2, a4, a5
+; RV32IMZB-NEXT: mulhu a5, a4, a3
+; RV32IMZB-NEXT: sub a1, a1, a0
+; RV32IMZB-NEXT: add a2, a5, a2
+; RV32IMZB-NEXT: mul a1, a1, a3
+; RV32IMZB-NEXT: add a1, a2, a1
+; RV32IMZB-NEXT: mul a0, a4, a3
+; RV32IMZB-NEXT: ret
;
; RV64-LABEL: udiv64_constant_add:
; RV64: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
index eb70d7f43c0ef..8250fc3a176e2 100644
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -117,24 +117,89 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind {
define iXLen2 @test_udiv_7(iXLen2 %x) nounwind {
; RV32-LABEL: test_udiv_7:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: li a2, 7
-; RV32-NEXT: li a3, 0
-; RV32-NEXT: call __udivdi3
-; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: lui a2, 262144
+; RV32-NEXT: slli a3, a1, 2
+; RV32-NEXT: srli a4, a0, 30
+; RV32-NEXT: srli a5, a1, 28
+; RV32-NEXT: lui a6, 149797
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: or a3, a4, a3
+; RV32-NEXT: and a4, a0, a2
+; RV32-NEXT: add a3, a0, a3
+; RV32-NEXT: add a5, a3, a5
+; RV32-NEXT: and a3, a3, a2
+; RV32-NEXT: sltu a3, a3, a4
+; RV32-NEXT: lui a4, 449390
+; RV32-NEXT: add a3, a5, a3
+; RV32-NEXT: lui a5, 748983
+; RV32-NEXT: addi a6, a6, -1755
+; RV32-NEXT: addi a4, a4, -1171
+; RV32-NEXT: addi a5, a5, -585
+; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: mulhu a3, a2, a6
+; RV32-NEXT: slli a6, a3, 3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: sub a2, a2, a6
+; RV32-NEXT: sub a3, a0, a2
+; RV32-NEXT: sltu a0, a0, a2
+; RV32-NEXT: mul a2, a3, a4
+; RV32-NEXT: mulhu a4, a3, a5
+; RV32-NEXT: sub a1, a1, a0
+; RV32-NEXT: add a2, a4, a2
+; RV32-NEXT: mul a1, a1, a5
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: mul a0, a3, a5
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_7:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: li a2, 7
-; RV64-NEXT: li a3, 0
-; RV64-NEXT: call __udivti3
-; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: slli a2, a1, 4
+; RV64-NEXT: srli a3, a0, 60
+; RV64-NEXT: slli a4, a1, 34
+; RV64-NEXT: srli a5, a0, 30
+; RV64-NEXT: lui a6, 262144
+; RV64-NEXT: srli a7, a1, 26
+; RV64-NEXT: or a2, a3, a2
+; RV64-NEXT: lui a3, 748983
+; RV64-NEXT: or a4, a5, a4
+; RV64-NEXT: addi a6, a6, -1
+; RV64-NEXT: addi a3, a3, -585
+; RV64-NEXT: add a4, a0, a4
+; RV64-NEXT: slli a5, a3, 33
+; RV64-NEXT: add a3, a3, a5
+; RV64-NEXT: and a5, a0, a6
+; RV64-NEXT: add a2, a4, a2
+; RV64-NEXT: and a4, a4, a6
+; RV64-NEXT: sltu a5, a4, a5
+; RV64-NEXT: add a5, a2, a5
+; RV64-NEXT: and a2, a2, a6
+; RV64-NEXT: sltu a2, a2, a4
+; RV64-NEXT: srli a4, a1, 56
+; RV64-NEXT: add a2, a2, a4
+; RV64-NEXT: lui a4, %hi(.LCPI2_0)
+; RV64-NEXT: add a7, a5, a7
+; RV64-NEXT: and a5, a5, a6
+; RV64-NEXT: add a2, a7, a2
+; RV64-NEXT: and a7, a7, a6
+; RV64-NEXT: sltu a5, a7, a5
+; RV64-NEXT: lui a7, %hi(.LCPI2_1)
+; RV64-NEXT: ld a4, %lo(.LCPI2_0)(a4)
+; RV64-NEXT: ld a7, %lo(.LCPI2_1)(a7)
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: and a2, a2, a6
+; RV64-NEXT: mulhu a4, a2, a4
+; RV64-NEXT: slli a5, a4, 3
+; RV64-NEXT: add a2, a2, a4
+; RV64-NEXT: sub a2, a2, a5
+; RV64-NEXT: sub a4, a0, a2
+; RV64-NEXT: sltu a0, a0, a2
+; RV64-NEXT: mul a2, a4, a7
+; RV64-NEXT: mulhu a5, a4, a3
+; RV64-NEXT: sub a1, a1, a0
+; RV64-NEXT: add a2, a5, a2
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: mul a0, a4, a3
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 7
ret iXLen2 %a
@@ -143,24 +208,86 @@ define iXLen2 @test_udiv_7(iXLen2 %x) nounwind {
define iXLen2 @test_udiv_9(iXLen2 %x) nounwind {
; RV32-LABEL: test_udiv_9:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: li a2, 9
-; RV32-NEXT: li a3, 0
-; RV32-NEXT: call __udivdi3
-; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: lui a2, 262144
+; RV32-NEXT: slli a3, a1, 2
+; RV32-NEXT: srli a4, a0, 30
+; RV32-NEXT: srli a5, a1, 28
+; RV32-NEXT: lui a6, 233017
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: or a3, a4, a3
+; RV32-NEXT: and a4, a0, a2
+; RV32-NEXT: add a3, a0, a3
+; RV32-NEXT: add a5, a3, a5
+; RV32-NEXT: and a3, a3, a2
+; RV32-NEXT: sltu a3, a3, a4
+; RV32-NEXT: lui a4, 582542
+; RV32-NEXT: addi a6, a6, -455
+; RV32-NEXT: addi a4, a4, 910
+; RV32-NEXT: add a3, a5, a3
+; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: mulhu a3, a2, a6
+; RV32-NEXT: srli a3, a3, 1
+; RV32-NEXT: slli a5, a3, 3
+; RV32-NEXT: sub a2, a2, a3
+; RV32-NEXT: sub a2, a2, a5
+; RV32-NEXT: sub a3, a0, a2
+; RV32-NEXT: sltu a0, a0, a2
+; RV32-NEXT: mul a2, a3, a4
+; RV32-NEXT: mulhu a4, a3, a6
+; RV32-NEXT: sub a1, a1, a0
+; RV32-NEXT: add a2, a4, a2
+; RV32-NEXT: mul a1, a1, a6
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: mul a0, a3, a6
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_9:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: li a2, 9
-; RV64-NEXT: li a3, 0
-; RV64-NEXT: call __udivti3
-; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: slli a2, a1, 4
+; RV64-NEXT: srli a3, a0, 60
+; RV64-NEXT: slli a4, a1, 34
+; RV64-NEXT: srli a5, a0, 30
+; RV64-NEXT: lui a6, 262144
+; RV64-NEXT: srli a7, a1, 26
+; RV64-NEXT: or a2, a3, a2
+; RV64-NEXT: srli a3, a1, 56
+; RV64-NEXT: or a4, a5, a4
+; RV64-NEXT: addi a6, a6, -1
+; RV64-NEXT: add a4, a0, a4
+; RV64-NEXT: and a5, a0, a6
+; RV64-NEXT: add a2, a4, a2
+; RV64-NEXT: and a4, a4, a6
+; RV64-NEXT: sltu a5, a4, a5
+; RV64-NEXT: add a5, a2, a5
+; RV64-NEXT: and a2, a2, a6
+; RV64-NEXT: sltu a2, a2, a4
+; RV64-NEXT: lui a4, %hi(.LCPI3_0)
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: lui a3, %hi(.LCPI3_1)
+; RV64-NEXT: add a7, a5, a7
+; RV64-NEXT: and a5, a5, a6
+; RV64-NEXT: add a2, a7, a2
+; RV64-NEXT: and a7, a7, a6
+; RV64-NEXT: sltu a5, a7, a5
+; RV64-NEXT: lui a7, %hi(.LCPI3_2)
+; RV64-NEXT: ld a4, %lo(.LCPI3_0)(a4)
+; RV64-NEXT: ld a3, %lo(.LCPI3_1)(a3)
+; RV64-NEXT: ld a7, %lo(.LCPI3_2)(a7)
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: and a2, a2, a6
+; RV64-NEXT: mulhu a4, a2, a4
+; RV64-NEXT: slli a5, a4, 3
+; RV64-NEXT: sub a2, a2, a4
+; RV64-NEXT: sub a2, a2, a5
+; RV64-NEXT: sub a4, a0, a2
+; RV64-NEXT: sltu a0, a0, a2
+; RV64-NEXT: mul a2, a4, a3
+; RV64-NEXT: mulhu a3, a4, a7
+; RV64-NEXT: sub a1, a1, a0
+; RV64-NEXT: add a2, a3, a2
+; RV64-NEXT: mul a1, a1, a7
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: mul a0, a4, a7
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 9
ret iXLen2 %a
diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
index bc4a99a00ac64..1680ea7d8da30 100644
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -79,24 +79,63 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind {
define iXLen2 @test_urem_7(iXLen2 %x) nounwind {
; RV32-LABEL: test_urem_7:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: li a2, 7
-; RV32-NEXT: li a3, 0
-; RV32-NEXT: call __umoddi3
-; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: lui a2, 262144
+; RV32-NEXT: slli a3, a1, 2
+; RV32-NEXT: srli a4, a0, 30
+; RV32-NEXT: srli a1, a1, 28
+; RV32-NEXT: lui a5, 149797
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: or a3, a4, a3
+; RV32-NEXT: addi a4, a5, -1755
+; RV32-NEXT: and a5, a0, a2
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: and a3, a0, a2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: sltu a1, a3, a5
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: mulhu a1, a0, a4
+; RV32-NEXT: slli a2, a1, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: sub a0, a0, a2
+; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
;
; RV64-LABEL: test_urem_7:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: li a2, 7
-; RV64-NEXT: li a3, 0
-; RV64-NEXT: call __umodti3
-; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: slli a2, a1, 4
+; RV64-NEXT: srli a3, a0, 60
+; RV64-NEXT: slli a4, a1, 34
+; RV64-NEXT: srli a5, a0, 30
+; RV64-NEXT: lui a6, 262144
+; RV64-NEXT: or a2, a3, a2
+; RV64-NEXT: srli a3, a1, 26
+; RV64-NEXT: srli a1, a1, 56
+; RV64-NEXT: or a4, a5, a4
+; RV64-NEXT: lui a5, %hi(.LCPI2_0)
+; RV64-NEXT: addi a6, a6, -1
+; RV64-NEXT: ld a5, %lo(.LCPI2_0)(a5)
+; RV64-NEXT: add a4, a0, a4
+; RV64-NEXT: and a0, a0, a6
+; RV64-NEXT: add a2, a4, a2
+; RV64-NEXT: and a4, a4, a6
+; RV64-NEXT: sltu a0, a4, a0
+; RV64-NEXT: add a0, a2, a0
+; RV64-NEXT: and a2, a2, a6
+; RV64-NEXT: sltu a2, a2, a4
+; RV64-NEXT: and a4, a0, a6
+; RV64-NEXT: add a0, a0, a3
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: and a2, a0, a6
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: sltu a1, a2, a4
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: and a0, a0, a6
+; RV64-NEXT: mulhu a1, a0, a5
+; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: sub a0, a0, a2
+; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
%a = urem iXLen2 %x, 7
ret iXLen2 %a
@@ -105,24 +144,64 @@ define iXLen2 @test_urem_7(iXLen2 %x) nounwind {
define iXLen2 @test_urem_9(iXLen2 %x) nounwind {
; RV32-LABEL: test_urem_9:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: li a2, 9
-; RV32-NEXT: li a3, 0
-; RV32-NEXT: call __umoddi3
-; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: lui a2, 262144
+; RV32-NEXT: slli a3, a1, 2
+; RV32-NEXT: srli a4, a0, 30
+; RV32-NEXT: srli a1, a1, 28
+; RV32-NEXT: lui a5, 233017
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: or a3, a4, a3
+; RV32-NEXT: addi a4, a5, -455
+; RV32-NEXT: and a5, a0, a2
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: and a3, a0, a2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: sltu a1, a3, a5
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: mulhu a1, a0, a4
+; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: slli a2, a1, 3
+; RV32-NEXT: sub a0, a0, a1
+; RV32-NEXT: sub a0, a0, a2
+; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
;
; RV64-LABEL: test_urem_9:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: li a2, 9
-; RV64-NEXT: li a3, 0
-; RV64-NEXT: call __umodti3
-; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: slli a2, a1, 4
+; RV64-NEXT: srli a3, a0, 60
+; RV64-NEXT: slli a4, a1, 34
+; RV64-NEXT: srli a5, a0, 30
+; RV64-NEXT: lui a6, 262144
+; RV64-NEXT: or a2, a3, a2
+; RV64-NEXT: srli a3, a1, 26
+; RV64-NEXT: srli a1, a1, 56
+; RV64-NEXT: or a4, a5, a4
+; RV64-NEXT: lui a5, %hi(.LCPI3_0)
+; RV64-NEXT: addi a6, a6, -1
+; RV64-NEXT: ld a5, %lo(.LCPI3_0)(a5)
+; RV64-NEXT: add a4, a0, a4
+; RV64-NEXT: and a0, a0, a6
+; RV64-NEXT: add a2, a4, a2
+; RV64-NEXT: and a4, a4, a6
+; RV64-NEXT: sltu a0, a4, a0
+; RV64-NEXT: add a0, a2, a0
+; RV64-NEXT: and a2, a2, a6
+; RV64-NEXT: sltu a2, a2, a4
+; RV64-NEXT: and a4, a0, a6
+; RV64-NEXT: add a0, a0, a3
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: and a2, a0, a6
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: sltu a1, a2, a4
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: and a0, a0, a6
+; RV64-NEXT: mulhu a1, a0, a5
+; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: sub a0, a0, a1
+; RV64-NEXT: sub a0, a0, a2
+; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
%a = urem iXLen2 %x, 9
ret iXLen2 %a
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index ac78136b9d8ea..f4f99749969e9 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -294,19 +294,47 @@ entry:
define i64 @PR23590(i64 %x) nounwind {
; X86-LABEL: PR23590:
; X86: # %bb.0: # %entry
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
; X86-NEXT: pushl $0
; X86-NEXT: pushl $12345 # imm = 0x3039
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll __umoddi3
; X86-NEXT: addl $16, %esp
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $7
-; X86-NEXT: pushl %edx
-; X86-NEXT: pushl %eax
-; X86-NEXT: calll __udivdi3
-; X86-NEXT: addl $28, %esp
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shldl $2, %esi, %eax
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: andl $1073741823, %edx # imm = 0x3FFFFFFF
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: shrl $28, %edi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: andl $1073741823, %ebx # imm = 0x3FFFFFFF
+; X86-NEXT: cmpl %ebx, %edx
+; X86-NEXT: adcl %eax, %edi
+; X86-NEXT: andl $1073741823, %edi # imm = 0x3FFFFFFF
+; X86-NEXT: movl $613566757, %edx # imm = 0x24924925
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: leal (,%edx,8), %eax
+; X86-NEXT: subl %eax, %edx
+; X86-NEXT: addl %edi, %edx
+; X86-NEXT: subl %edx, %esi
+; X86-NEXT: sbbl $0, %ecx
+; X86-NEXT: movl $-1227133513, %edx # imm = 0xB6DB6DB7
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: imull $1840700269, %esi, %esi # imm = 0x6DB6DB6D
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: imull $-1227133513, %ecx, %ecx # imm = 0xB6DB6DB7
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-FAST-LABEL: PR23590:
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index 3796dd796eaf9..16865030cfc36 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -67,25 +67,76 @@ define i64 @div128(i128 %x) nounwind {
define i64 @umod128(i128 %x) nounwind {
; X86-64-LABEL: umod128:
; X86-64: # %bb.0:
-; X86-64-NEXT: pushq %rax
-; X86-64-NEXT: movl $11, %edx
-; X86-64-NEXT: xorl %ecx, %ecx
-; X86-64-NEXT: callq __umodti3 at PLT
-; X86-64-NEXT: popq %rcx
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: shldq $4, %rdi, %rax
+; X86-64-NEXT: movq %rdi, %rcx
+; X86-64-NEXT: shrq $30, %rcx
+; X86-64-NEXT: addl %edi, %ecx
+; X86-64-NEXT: movl %ecx, %edx
+; X86-64-NEXT: andl $1073741823, %edx # imm = 0x3FFFFFFF
+; X86-64-NEXT: andl $1073741823, %edi # imm = 0x3FFFFFFF
+; X86-64-NEXT: cmpl %edi, %edx
+; X86-64-NEXT: movl %ecx, %edi
+; X86-64-NEXT: adcl %eax, %edi
+; X86-64-NEXT: addl %eax, %ecx
+; X86-64-NEXT: andl $1073741823, %ecx # imm = 0x3FFFFFFF
+; X86-64-NEXT: movq %rsi, %rax
+; X86-64-NEXT: shrq $26, %rax
+; X86-64-NEXT: cmpl %edx, %ecx
+; X86-64-NEXT: movl %edi, %ecx
+; X86-64-NEXT: adcl %eax, %ecx
+; X86-64-NEXT: movl %edi, %edx
+; X86-64-NEXT: andl $1073741823, %edx # imm = 0x3FFFFFFF
+; X86-64-NEXT: addl %eax, %edi
+; X86-64-NEXT: andl $1073741823, %edi # imm = 0x3FFFFFFF
+; X86-64-NEXT: shrq $56, %rsi
+; X86-64-NEXT: cmpl %edx, %edi
+; X86-64-NEXT: adcl %esi, %ecx
+; X86-64-NEXT: andl $1073741823, %ecx # imm = 0x3FFFFFFF
+; X86-64-NEXT: movabsq $1676976733973595602, %rdx # imm = 0x1745D1745D1745D2
+; X86-64-NEXT: movq %rcx, %rax
+; X86-64-NEXT: mulq %rdx
+; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax
+; X86-64-NEXT: leaq (%rdx,%rax,2), %rax
+; X86-64-NEXT: subq %rax, %rcx
+; X86-64-NEXT: movq %rcx, %rax
; X86-64-NEXT: retq
;
; WIN64-LABEL: umod128:
; WIN64: # %bb.0:
-; WIN64-NEXT: subq $72, %rsp
-; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $11, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT: callq __umodti3
-; WIN64-NEXT: movq %xmm0, %rax
-; WIN64-NEXT: addq $72, %rsp
+; WIN64-NEXT: movq %rdx, %rax
+; WIN64-NEXT: shldq $4, %rcx, %rax
+; WIN64-NEXT: movq %rcx, %r8
+; WIN64-NEXT: shrq $30, %r8
+; WIN64-NEXT: addl %ecx, %r8d
+; WIN64-NEXT: movl %r8d, %r9d
+; WIN64-NEXT: andl $1073741823, %r9d # imm = 0x3FFFFFFF
+; WIN64-NEXT: andl $1073741823, %ecx # imm = 0x3FFFFFFF
+; WIN64-NEXT: cmpl %ecx, %r9d
+; WIN64-NEXT: movl %r8d, %r10d
+; WIN64-NEXT: adcl %eax, %r10d
+; WIN64-NEXT: addl %eax, %r8d
+; WIN64-NEXT: andl $1073741823, %r8d # imm = 0x3FFFFFFF
+; WIN64-NEXT: movq %rdx, %rax
+; WIN64-NEXT: shrq $26, %rax
+; WIN64-NEXT: cmpl %r9d, %r8d
+; WIN64-NEXT: movl %r10d, %ecx
+; WIN64-NEXT: adcl %eax, %ecx
+; WIN64-NEXT: movl %r10d, %r8d
+; WIN64-NEXT: andl $1073741823, %r8d # imm = 0x3FFFFFFF
+; WIN64-NEXT: addl %eax, %r10d
+; WIN64-NEXT: andl $1073741823, %r10d # imm = 0x3FFFFFFF
+; WIN64-NEXT: shrq $56, %rdx
+; WIN64-NEXT: cmpl %r8d, %r10d
+; WIN64-NEXT: adcl %edx, %ecx
+; WIN64-NEXT: andl $1073741823, %ecx # imm = 0x3FFFFFFF
+; WIN64-NEXT: movabsq $1676976733973595602, %rdx # imm = 0x1745D1745D1745D2
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: mulq %rdx
+; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax
+; WIN64-NEXT: leaq (%rdx,%rax,2), %rax
+; WIN64-NEXT: subq %rax, %rcx
+; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/uint128-div-const.ll b/llvm/test/CodeGen/X86/uint128-div-const.ll
new file mode 100644
index 0000000000000..60dddad952679
--- /dev/null
+++ b/llvm/test/CodeGen/X86/uint128-div-const.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -O2 | FileCheck %s
+
+define i128 @div_by_7(i128 %x) {
+; CHECK-LABEL: div_by_7:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: shldq $4, %rdi, %rax
+; CHECK-NEXT: movq %rdi, %rcx
+; CHECK-NEXT: shrq $30, %rcx
+; CHECK-NEXT: addl %edi, %ecx
+; CHECK-NEXT: movl %ecx, %edx
+; CHECK-NEXT: andl $1073741823, %edx # imm = 0x3FFFFFFF
+; CHECK-NEXT: movl %edi, %r8d
+; CHECK-NEXT: andl $1073741823, %r8d # imm = 0x3FFFFFFF
+; CHECK-NEXT: cmpl %r8d, %edx
+; CHECK-NEXT: movl %ecx, %r8d
+; CHECK-NEXT: adcl %eax, %r8d
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: andl $1073741823, %ecx # imm = 0x3FFFFFFF
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: shrq $26, %rax
+; CHECK-NEXT: cmpl %edx, %ecx
+; CHECK-NEXT: movl %r8d, %edx
+; CHECK-NEXT: adcl %eax, %edx
+; CHECK-NEXT: movl %r8d, %r9d
+; CHECK-NEXT: andl $1073741823, %r9d # imm = 0x3FFFFFFF
+; CHECK-NEXT: addl %eax, %r8d
+; CHECK-NEXT: andl $1073741823, %r8d # imm = 0x3FFFFFFF
+; CHECK-NEXT: movq %rsi, %rcx
+; CHECK-NEXT: shrq $56, %rcx
+; CHECK-NEXT: cmpl %r9d, %r8d
+; CHECK-NEXT: adcl %edx, %ecx
+; CHECK-NEXT: andl $1073741823, %ecx # imm = 0x3FFFFFFF
+; CHECK-NEXT: movabsq $2635249153387078803, %rdx # imm = 0x2492492492492493
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: mulq %rdx
+; CHECK-NEXT: leal (,%rdx,8), %eax
+; CHECK-NEXT: subq %rax, %rdx
+; CHECK-NEXT: addq %rcx, %rdx
+; CHECK-NEXT: subq %rdx, %rdi
+; CHECK-NEXT: sbbq $0, %rsi
+; CHECK-NEXT: movabsq $-5270498306774157605, %rcx # imm = 0xB6DB6DB6DB6DB6DB
+; CHECK-NEXT: imulq %rdi, %rcx
+; CHECK-NEXT: movabsq $7905747460161236407, %r8 # imm = 0x6DB6DB6DB6DB6DB7
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: mulq %r8
+; CHECK-NEXT: addq %rcx, %rdx
+; CHECK-NEXT: imulq %rsi, %r8
+; CHECK-NEXT: addq %r8, %rdx
+; CHECK-NEXT: retq
+entry:
+ %div = udiv i128 %x, 7
+ ret i128 %div
+}
+
+define i128 @div_by_9(i128 %x) {
+; CHECK-LABEL: div_by_9:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: shldq $4, %rdi, %rax
+; CHECK-NEXT: movq %rdi, %rcx
+; CHECK-NEXT: shrq $30, %rcx
+; CHECK-NEXT: addl %edi, %ecx
+; CHECK-NEXT: movl %ecx, %edx
+; CHECK-NEXT: andl $1073741823, %edx # imm = 0x3FFFFFFF
+; CHECK-NEXT: movl %edi, %r8d
+; CHECK-NEXT: andl $1073741823, %r8d # imm = 0x3FFFFFFF
+; CHECK-NEXT: cmpl %r8d, %edx
+; CHECK-NEXT: movl %ecx, %r8d
+; CHECK-NEXT: adcl %eax, %r8d
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: andl $1073741823, %ecx # imm = 0x3FFFFFFF
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: shrq $26, %rax
+; CHECK-NEXT: cmpl %edx, %ecx
+; CHECK-NEXT: movl %r8d, %edx
+; CHECK-NEXT: adcl %eax, %edx
+; CHECK-NEXT: movl %r8d, %r9d
+; CHECK-NEXT: andl $1073741823, %r9d # imm = 0x3FFFFFFF
+; CHECK-NEXT: addl %eax, %r8d
+; CHECK-NEXT: andl $1073741823, %r8d # imm = 0x3FFFFFFF
+; CHECK-NEXT: movq %rsi, %rcx
+; CHECK-NEXT: shrq $56, %rcx
+; CHECK-NEXT: cmpl %r9d, %r8d
+; CHECK-NEXT: adcl %edx, %ecx
+; CHECK-NEXT: andl $1073741823, %ecx # imm = 0x3FFFFFFF
+; CHECK-NEXT: movabsq $2049638230412172402, %rdx # imm = 0x1C71C71C71C71C72
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: mulq %rdx
+; CHECK-NEXT: leaq (%rdx,%rdx,8), %rax
+; CHECK-NEXT: subq %rax, %rcx
+; CHECK-NEXT: subq %rcx, %rdi
+; CHECK-NEXT: sbbq $0, %rsi
+; CHECK-NEXT: movabsq $4099276460824344803, %rcx # imm = 0x38E38E38E38E38E3
+; CHECK-NEXT: imulq %rdi, %rcx
+; CHECK-NEXT: movabsq $-8198552921648689607, %r8 # imm = 0x8E38E38E38E38E39
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: mulq %r8
+; CHECK-NEXT: addq %rcx, %rdx
+; CHECK-NEXT: imulq %rsi, %r8
+; CHECK-NEXT: addq %r8, %rdx
+; CHECK-NEXT: retq
+entry:
+ %div = udiv i128 %x, 9
+ ret i128 %div
+}
+
+define i128 @div_by_25(i128 %x) {
+; CHECK-LABEL: div_by_25:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: shldq $24, %rdi, %rax
+; CHECK-NEXT: movq %rdi, %rcx
+; CHECK-NEXT: shrq $20, %rcx
+; CHECK-NEXT: addl %edi, %ecx
+; CHECK-NEXT: movl %ecx, %edx
+; CHECK-NEXT: andl $1048575, %edx # imm = 0xFFFFF
+; CHECK-NEXT: movl %edi, %r8d
+; CHECK-NEXT: andl $1048575, %r8d # imm = 0xFFFFF
+; CHECK-NEXT: cmpl %r8d, %edx
+; CHECK-NEXT: movl %ecx, %r8d
+; CHECK-NEXT: adcl %eax, %r8d
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: andl $1048575, %ecx # imm = 0xFFFFF
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: shldq $4, %rdi, %rax
+; CHECK-NEXT: cmpl %edx, %ecx
+; CHECK-NEXT: movl %r8d, %ecx
+; CHECK-NEXT: adcl %eax, %ecx
+; CHECK-NEXT: movl %r8d, %edx
+; CHECK-NEXT: andl $1048575, %edx # imm = 0xFFFFF
+; CHECK-NEXT: addl %eax, %r8d
+; CHECK-NEXT: andl $1048575, %r8d # imm = 0xFFFFF
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: shrq $16, %rax
+; CHECK-NEXT: cmpl %edx, %r8d
+; CHECK-NEXT: movl %ecx, %edx
+; CHECK-NEXT: adcl %eax, %edx
+; CHECK-NEXT: movl %ecx, %r8d
+; CHECK-NEXT: andl $1048575, %r8d # imm = 0xFFFFF
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: andl $1048575, %ecx # imm = 0xFFFFF
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: shrq $36, %rax
+; CHECK-NEXT: cmpl %r8d, %ecx
+; CHECK-NEXT: movl %edx, %r8d
+; CHECK-NEXT: adcl %eax, %r8d
+; CHECK-NEXT: movl %edx, %r9d
+; CHECK-NEXT: andl $1048575, %r9d # imm = 0xFFFFF
+; CHECK-NEXT: addl %eax, %edx
+; CHECK-NEXT: andl $1048575, %edx # imm = 0xFFFFF
+; CHECK-NEXT: movq %rsi, %rcx
+; CHECK-NEXT: shrq $56, %rcx
+; CHECK-NEXT: cmpl %r9d, %edx
+; CHECK-NEXT: adcl %r8d, %ecx
+; CHECK-NEXT: andl $1048575, %ecx # imm = 0xFFFFF
+; CHECK-NEXT: movabsq $737869762948382065, %rdx # imm = 0xA3D70A3D70A3D71
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: mulq %rdx
+; CHECK-NEXT: leaq (%rdx,%rdx,4), %rax
+; CHECK-NEXT: leaq (%rax,%rax,4), %rax
+; CHECK-NEXT: subq %rax, %rcx
+; CHECK-NEXT: subq %rcx, %rdi
+; CHECK-NEXT: sbbq $0, %rsi
+; CHECK-NEXT: movabsq $2951479051793528258, %rcx # imm = 0x28F5C28F5C28F5C2
+; CHECK-NEXT: imulq %rdi, %rcx
+; CHECK-NEXT: movabsq $-8116567392432202711, %r8 # imm = 0x8F5C28F5C28F5C29
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: mulq %r8
+; CHECK-NEXT: addq %rcx, %rdx
+; CHECK-NEXT: imulq %rsi, %r8
+; CHECK-NEXT: addq %r8, %rdx
+; CHECK-NEXT: retq
+entry:
+ %div = udiv i128 %x, 25
+ ret i128 %div
+}
+
+define i128 @div_by_14(i128 %x) {
+; CHECK-LABEL: div_by_14:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movl $14, %edx
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: callq __udivti3 at PLT
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+entry:
+ %div = udiv i128 %x, 14
+ ret i128 %div
+}
+
+define i128 @div_by_22(i128 %x) {
+; CHECK-LABEL: div_by_22:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movl $22, %edx
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: callq __udivti3 at PLT
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+entry:
+ %div = udiv i128 %x, 22
+ ret i128 %div
+}
>From c5485eff36f1c1ae9591082d5295c2e5069999a7 Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Sun, 1 Mar 2026 19:45:52 +0530
Subject: [PATCH 02/15] Address review comments
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 44 +++---
llvm/test/CodeGen/AArch64/rem-by-const.ll | 80 ++++++-----
llvm/test/CodeGen/X86/divmod128.ll | 80 ++++-------
llvm/test/CodeGen/X86/uint128-div-const.ll | 136 +++++-------------
4 files changed, 135 insertions(+), 205 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index ec5e7b44a3d09..284f680612341 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8241,10 +8241,25 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
unsigned BitWidth = VT.getScalarSizeInBits();
unsigned BestChunkWidth = 0;
- // We restrict to small chunk sizes (e.g., ≤ 32 bits) to ensure that all
- // operations remain legal on most targets.
- unsigned MaxChunk = 32;
- for (int i = MaxChunk; i >= 1; --i) {
+ // Determine the largest legal scalar integer type we can safely use
+ // for chunk operations.
+ unsigned MaxChunk = 0;
+
+ // Use the largest legal integer register type for this VT.
+ EVT LegalVT = EVT(getRegisterType(*DAG.getContext(), VT));
+ if (LegalVT.isInteger())
+ MaxChunk = LegalVT.getSizeInBits();
+ else
+ return false;
+
+ // Clamp to the original bit width.
+ MaxChunk = std::min(MaxChunk, BitWidth);
+
+ // Find the largest chunk width W in (MaxChunk/2, MaxChunk] satisfying
+ // (1 << W) % Divisor == 1.
+ // Then 2^W ≡ 1 (mod Divisor), so a value written in base 2^W can be
+ // reduced modulo Divisor by summing its W-bit chunks.
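+      // For example, with Divisor == 7 on a 64-bit target this picks
+      // W == 63, since 2^63 = (2^3)^21 ≡ 1 (mod 7).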
+ for (unsigned i = MaxChunk; i > MaxChunk / 2; --i) {
APInt ChunkMaxPlus1 = APInt::getOneBitSet(BitWidth, i);
if (ChunkMaxPlus1.urem(Divisor).isOne()) {
BestChunkWidth = i;
@@ -8256,10 +8271,13 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
if (BestChunkWidth > 0) {
EVT ChunkVT = EVT::getIntegerVT(*DAG.getContext(), BestChunkWidth);
- if (!LL)
- std::tie(LL, LH) =
- DAG.SplitScalar(N->getOperand(0), dl, HiLoVT, HiLoVT);
- SDValue In = DAG.getNode(ISD::BUILD_PAIR, dl, VT, LL, LH);
+ SDValue In;
+
+ if (LL) {
+ In = DAG.getNode(ISD::BUILD_PAIR, dl, VT, LL, LH);
+ } else {
+ In = N->getOperand(0);
+ }
SmallVector<SDValue, 8> Parts;
// Split into fixed-size chunks
@@ -8287,15 +8305,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
} else {
SDValue Add = DAG.getNode(ISD::ADD, dl, ChunkVT, Sum, Parts[i]);
SDValue NewCarry = DAG.getSetCC(dl, SetCCType, Add, Sum, ISD::SETULT);
-
- if (getBooleanContents(ChunkVT) ==
- TargetLoweringBase::ZeroOrOneBooleanContent)
- NewCarry = DAG.getZExtOrTrunc(NewCarry, dl, ChunkVT);
- else
- NewCarry = DAG.getSelect(dl, ChunkVT, NewCarry,
- DAG.getConstant(1, dl, ChunkVT),
- DAG.getConstant(0, dl, ChunkVT));
-
+ NewCarry = DAG.getZExtOrTrunc(NewCarry, dl, ChunkVT);
Sum = DAG.getNode(ISD::ADD, dl, ChunkVT, Add, Carry);
Carry = NewCarry;
}
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index eaed62961fc50..d2875d9a3fc05 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -500,13 +500,23 @@ entry:
define i128 @ui128_7(i128 %a, i128 %b) {
; CHECK-SD-LABEL: ui128_7:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT: .cfi_offset w30, -16
-; CHECK-SD-NEXT: mov w2, #7 // =0x7
-; CHECK-SD-NEXT: mov x3, xzr
-; CHECK-SD-NEXT: bl __umodti3
-; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT: extr x9, x1, x0, #63
+; CHECK-SD-NEXT: mov x8, #18725 // =0x4925
+; CHECK-SD-NEXT: and x11, x0, #0x7fffffffffffffff
+; CHECK-SD-NEXT: movk x8, #9362, lsl #16
+; CHECK-SD-NEXT: add x9, x0, x9
+; CHECK-SD-NEXT: movk x8, #37449, lsl #32
+; CHECK-SD-NEXT: add x10, x9, x1, lsr #62
+; CHECK-SD-NEXT: and x9, x9, #0x7fffffffffffffff
+; CHECK-SD-NEXT: movk x8, #18724, lsl #48
+; CHECK-SD-NEXT: cmp x9, x11
+; CHECK-SD-NEXT: mov x1, xzr
+; CHECK-SD-NEXT: cinc x9, x10, lo
+; CHECK-SD-NEXT: and x9, x9, #0x7fffffffffffffff
+; CHECK-SD-NEXT: umulh x8, x9, x8
+; CHECK-SD-NEXT: lsr x8, x8, #1
+; CHECK-SD-NEXT: sub x8, x8, x8, lsl #3
+; CHECK-SD-NEXT: add x0, x9, x8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: ui128_7:
@@ -3079,34 +3089,36 @@ entry:
define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) {
; CHECK-SD-LABEL: uv2i128_7:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-SD-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 48
-; CHECK-SD-NEXT: .cfi_offset w19, -8
-; CHECK-SD-NEXT: .cfi_offset w20, -16
-; CHECK-SD-NEXT: .cfi_offset w21, -24
-; CHECK-SD-NEXT: .cfi_offset w22, -32
-; CHECK-SD-NEXT: .cfi_offset w30, -48
-; CHECK-SD-NEXT: mov x19, x3
-; CHECK-SD-NEXT: mov x20, x2
-; CHECK-SD-NEXT: mov w2, #7 // =0x7
-; CHECK-SD-NEXT: mov x3, xzr
-; CHECK-SD-NEXT: bl __umodti3
-; CHECK-SD-NEXT: mov x21, x0
-; CHECK-SD-NEXT: mov x22, x1
-; CHECK-SD-NEXT: mov x0, x20
-; CHECK-SD-NEXT: mov x1, x19
-; CHECK-SD-NEXT: mov w2, #7 // =0x7
+; CHECK-SD-NEXT: extr x9, x1, x0, #63
+; CHECK-SD-NEXT: extr x8, x3, x2, #63
+; CHECK-SD-NEXT: and x10, x0, #0x7fffffffffffffff
+; CHECK-SD-NEXT: and x12, x2, #0x7fffffffffffffff
+; CHECK-SD-NEXT: add x9, x0, x9
+; CHECK-SD-NEXT: add x8, x2, x8
+; CHECK-SD-NEXT: add x11, x9, x1, lsr #62
+; CHECK-SD-NEXT: and x9, x9, #0x7fffffffffffffff
+; CHECK-SD-NEXT: mov x1, xzr
+; CHECK-SD-NEXT: cmp x9, x10
+; CHECK-SD-NEXT: add x9, x8, x3, lsr #62
+; CHECK-SD-NEXT: and x8, x8, #0x7fffffffffffffff
+; CHECK-SD-NEXT: cinc x10, x11, lo
+; CHECK-SD-NEXT: mov x11, #18725 // =0x4925
+; CHECK-SD-NEXT: cmp x8, x12
+; CHECK-SD-NEXT: movk x11, #9362, lsl #16
+; CHECK-SD-NEXT: cinc x9, x9, lo
+; CHECK-SD-NEXT: and x8, x10, #0x7fffffffffffffff
+; CHECK-SD-NEXT: movk x11, #37449, lsl #32
+; CHECK-SD-NEXT: and x9, x9, #0x7fffffffffffffff
; CHECK-SD-NEXT: mov x3, xzr
-; CHECK-SD-NEXT: bl __umodti3
-; CHECK-SD-NEXT: mov x2, x0
-; CHECK-SD-NEXT: mov x3, x1
-; CHECK-SD-NEXT: mov x0, x21
-; CHECK-SD-NEXT: mov x1, x22
-; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-SD-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT: movk x11, #18724, lsl #48
+; CHECK-SD-NEXT: umulh x10, x8, x11
+; CHECK-SD-NEXT: umulh x11, x9, x11
+; CHECK-SD-NEXT: lsr x10, x10, #1
+; CHECK-SD-NEXT: lsr x11, x11, #1
+; CHECK-SD-NEXT: sub x10, x10, x10, lsl #3
+; CHECK-SD-NEXT: sub x11, x11, x11, lsl #3
+; CHECK-SD-NEXT: add x0, x8, x10
+; CHECK-SD-NEXT: add x2, x9, x11
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: uv2i128_7:
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index 16865030cfc36..10b91e82f915a 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -67,32 +67,17 @@ define i64 @div128(i128 %x) nounwind {
define i64 @umod128(i128 %x) nounwind {
; X86-64-LABEL: umod128:
; X86-64: # %bb.0:
-; X86-64-NEXT: movq %rsi, %rax
-; X86-64-NEXT: shldq $4, %rdi, %rax
-; X86-64-NEXT: movq %rdi, %rcx
-; X86-64-NEXT: shrq $30, %rcx
-; X86-64-NEXT: addl %edi, %ecx
-; X86-64-NEXT: movl %ecx, %edx
-; X86-64-NEXT: andl $1073741823, %edx # imm = 0x3FFFFFFF
-; X86-64-NEXT: andl $1073741823, %edi # imm = 0x3FFFFFFF
-; X86-64-NEXT: cmpl %edi, %edx
-; X86-64-NEXT: movl %ecx, %edi
-; X86-64-NEXT: adcl %eax, %edi
-; X86-64-NEXT: addl %eax, %ecx
-; X86-64-NEXT: andl $1073741823, %ecx # imm = 0x3FFFFFFF
-; X86-64-NEXT: movq %rsi, %rax
-; X86-64-NEXT: shrq $26, %rax
-; X86-64-NEXT: cmpl %edx, %ecx
-; X86-64-NEXT: movl %edi, %ecx
-; X86-64-NEXT: adcl %eax, %ecx
-; X86-64-NEXT: movl %edi, %edx
-; X86-64-NEXT: andl $1073741823, %edx # imm = 0x3FFFFFFF
-; X86-64-NEXT: addl %eax, %edi
-; X86-64-NEXT: andl $1073741823, %edi # imm = 0x3FFFFFFF
+; X86-64-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
+; X86-64-NEXT: movq %rsi, %rcx
+; X86-64-NEXT: shldq $4, %rdi, %rcx
+; X86-64-NEXT: addq %rdi, %rcx
+; X86-64-NEXT: andq %rax, %rdi
+; X86-64-NEXT: movq %rcx, %rdx
+; X86-64-NEXT: andq %rax, %rdx
; X86-64-NEXT: shrq $56, %rsi
-; X86-64-NEXT: cmpl %edx, %edi
-; X86-64-NEXT: adcl %esi, %ecx
-; X86-64-NEXT: andl $1073741823, %ecx # imm = 0x3FFFFFFF
+; X86-64-NEXT: cmpq %rdi, %rdx
+; X86-64-NEXT: adcq %rsi, %rcx
+; X86-64-NEXT: andq %rax, %rcx
; X86-64-NEXT: movabsq $1676976733973595602, %rdx # imm = 0x1745D1745D1745D2
; X86-64-NEXT: movq %rcx, %rax
; X86-64-NEXT: mulq %rdx
@@ -104,39 +89,24 @@ define i64 @umod128(i128 %x) nounwind {
;
; WIN64-LABEL: umod128:
; WIN64: # %bb.0:
-; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: shldq $4, %rcx, %rax
-; WIN64-NEXT: movq %rcx, %r8
-; WIN64-NEXT: shrq $30, %r8
-; WIN64-NEXT: addl %ecx, %r8d
-; WIN64-NEXT: movl %r8d, %r9d
-; WIN64-NEXT: andl $1073741823, %r9d # imm = 0x3FFFFFFF
-; WIN64-NEXT: andl $1073741823, %ecx # imm = 0x3FFFFFFF
-; WIN64-NEXT: cmpl %ecx, %r9d
-; WIN64-NEXT: movl %r8d, %r10d
-; WIN64-NEXT: adcl %eax, %r10d
-; WIN64-NEXT: addl %eax, %r8d
-; WIN64-NEXT: andl $1073741823, %r8d # imm = 0x3FFFFFFF
-; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: shrq $26, %rax
-; WIN64-NEXT: cmpl %r9d, %r8d
-; WIN64-NEXT: movl %r10d, %ecx
-; WIN64-NEXT: adcl %eax, %ecx
-; WIN64-NEXT: movl %r10d, %r8d
-; WIN64-NEXT: andl $1073741823, %r8d # imm = 0x3FFFFFFF
-; WIN64-NEXT: addl %eax, %r10d
-; WIN64-NEXT: andl $1073741823, %r10d # imm = 0x3FFFFFFF
+; WIN64-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
+; WIN64-NEXT: movq %rdx, %r8
+; WIN64-NEXT: shldq $4, %rcx, %r8
+; WIN64-NEXT: addq %rcx, %r8
+; WIN64-NEXT: andq %rax, %rcx
+; WIN64-NEXT: movq %r8, %r9
+; WIN64-NEXT: andq %rax, %r9
; WIN64-NEXT: shrq $56, %rdx
-; WIN64-NEXT: cmpl %r8d, %r10d
-; WIN64-NEXT: adcl %edx, %ecx
-; WIN64-NEXT: andl $1073741823, %ecx # imm = 0x3FFFFFFF
-; WIN64-NEXT: movabsq $1676976733973595602, %rdx # imm = 0x1745D1745D1745D2
-; WIN64-NEXT: movq %rcx, %rax
-; WIN64-NEXT: mulq %rdx
+; WIN64-NEXT: cmpq %rcx, %r9
+; WIN64-NEXT: adcq %rdx, %r8
+; WIN64-NEXT: andq %rax, %r8
+; WIN64-NEXT: movabsq $1676976733973595602, %rcx # imm = 0x1745D1745D1745D2
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %rcx
; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax
; WIN64-NEXT: leaq (%rdx,%rax,2), %rax
-; WIN64-NEXT: subq %rax, %rcx
-; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: subq %rax, %r8
+; WIN64-NEXT: movq %r8, %rax
; WIN64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/uint128-div-const.ll b/llvm/test/CodeGen/X86/uint128-div-const.ll
index 60dddad952679..952b98af6adea 100644
--- a/llvm/test/CodeGen/X86/uint128-div-const.ll
+++ b/llvm/test/CodeGen/X86/uint128-div-const.ll
@@ -4,38 +4,24 @@
define i128 @div_by_7(i128 %x) {
; CHECK-LABEL: div_by_7:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: shldq $4, %rdi, %rax
-; CHECK-NEXT: movq %rdi, %rcx
-; CHECK-NEXT: shrq $30, %rcx
-; CHECK-NEXT: addl %edi, %ecx
-; CHECK-NEXT: movl %ecx, %edx
-; CHECK-NEXT: andl $1073741823, %edx # imm = 0x3FFFFFFF
-; CHECK-NEXT: movl %edi, %r8d
-; CHECK-NEXT: andl $1073741823, %r8d # imm = 0x3FFFFFFF
-; CHECK-NEXT: cmpl %r8d, %edx
-; CHECK-NEXT: movl %ecx, %r8d
-; CHECK-NEXT: adcl %eax, %r8d
-; CHECK-NEXT: addl %eax, %ecx
-; CHECK-NEXT: andl $1073741823, %ecx # imm = 0x3FFFFFFF
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: shrq $26, %rax
-; CHECK-NEXT: cmpl %edx, %ecx
-; CHECK-NEXT: movl %r8d, %edx
-; CHECK-NEXT: adcl %eax, %edx
-; CHECK-NEXT: movl %r8d, %r9d
-; CHECK-NEXT: andl $1073741823, %r9d # imm = 0x3FFFFFFF
-; CHECK-NEXT: addl %eax, %r8d
-; CHECK-NEXT: andl $1073741823, %r8d # imm = 0x3FFFFFFF
+; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT: movq %rdi, %rdx
+; CHECK-NEXT: andq %rax, %rdx
+; CHECK-NEXT: movq %rsi, %r8
+; CHECK-NEXT: shldq $1, %rdi, %r8
+; CHECK-NEXT: addq %rdi, %r8
+; CHECK-NEXT: movq %r8, %r9
+; CHECK-NEXT: andq %rax, %r9
; CHECK-NEXT: movq %rsi, %rcx
-; CHECK-NEXT: shrq $56, %rcx
-; CHECK-NEXT: cmpl %r9d, %r8d
-; CHECK-NEXT: adcl %edx, %ecx
-; CHECK-NEXT: andl $1073741823, %ecx # imm = 0x3FFFFFFF
-; CHECK-NEXT: movabsq $2635249153387078803, %rdx # imm = 0x2492492492492493
+; CHECK-NEXT: shrq $62, %rcx
+; CHECK-NEXT: cmpq %rdx, %r9
+; CHECK-NEXT: adcq %r8, %rcx
+; CHECK-NEXT: andq %rax, %rcx
+; CHECK-NEXT: movabsq $5270498306774157605, %rdx # imm = 0x4924924924924925
; CHECK-NEXT: movq %rcx, %rax
; CHECK-NEXT: mulq %rdx
-; CHECK-NEXT: leal (,%rdx,8), %eax
+; CHECK-NEXT: shrq %rdx
+; CHECK-NEXT: leaq (,%rdx,8), %rax
; CHECK-NEXT: subq %rax, %rdx
; CHECK-NEXT: addq %rcx, %rdx
; CHECK-NEXT: subq %rdx, %rdi
@@ -57,34 +43,19 @@ entry:
define i128 @div_by_9(i128 %x) {
; CHECK-LABEL: div_by_9:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: shldq $4, %rdi, %rax
-; CHECK-NEXT: movq %rdi, %rcx
-; CHECK-NEXT: shrq $30, %rcx
-; CHECK-NEXT: addl %edi, %ecx
-; CHECK-NEXT: movl %ecx, %edx
-; CHECK-NEXT: andl $1073741823, %edx # imm = 0x3FFFFFFF
-; CHECK-NEXT: movl %edi, %r8d
-; CHECK-NEXT: andl $1073741823, %r8d # imm = 0x3FFFFFFF
-; CHECK-NEXT: cmpl %r8d, %edx
-; CHECK-NEXT: movl %ecx, %r8d
-; CHECK-NEXT: adcl %eax, %r8d
-; CHECK-NEXT: addl %eax, %ecx
-; CHECK-NEXT: andl $1073741823, %ecx # imm = 0x3FFFFFFF
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: shrq $26, %rax
-; CHECK-NEXT: cmpl %edx, %ecx
-; CHECK-NEXT: movl %r8d, %edx
-; CHECK-NEXT: adcl %eax, %edx
-; CHECK-NEXT: movl %r8d, %r9d
-; CHECK-NEXT: andl $1073741823, %r9d # imm = 0x3FFFFFFF
-; CHECK-NEXT: addl %eax, %r8d
-; CHECK-NEXT: andl $1073741823, %r8d # imm = 0x3FFFFFFF
+; CHECK-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
+; CHECK-NEXT: movq %rdi, %rdx
+; CHECK-NEXT: andq %rax, %rdx
+; CHECK-NEXT: movq %rsi, %r8
+; CHECK-NEXT: shldq $4, %rdi, %r8
+; CHECK-NEXT: addq %rdi, %r8
+; CHECK-NEXT: movq %r8, %r9
+; CHECK-NEXT: andq %rax, %r9
; CHECK-NEXT: movq %rsi, %rcx
; CHECK-NEXT: shrq $56, %rcx
-; CHECK-NEXT: cmpl %r9d, %r8d
-; CHECK-NEXT: adcl %edx, %ecx
-; CHECK-NEXT: andl $1073741823, %ecx # imm = 0x3FFFFFFF
+; CHECK-NEXT: cmpq %rdx, %r9
+; CHECK-NEXT: adcq %r8, %rcx
+; CHECK-NEXT: andq %rax, %rcx
; CHECK-NEXT: movabsq $2049638230412172402, %rdx # imm = 0x1C71C71C71C71C72
; CHECK-NEXT: movq %rcx, %rax
; CHECK-NEXT: mulq %rdx
@@ -109,52 +80,19 @@ entry:
define i128 @div_by_25(i128 %x) {
; CHECK-LABEL: div_by_25:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: shldq $24, %rdi, %rax
-; CHECK-NEXT: movq %rdi, %rcx
-; CHECK-NEXT: shrq $20, %rcx
-; CHECK-NEXT: addl %edi, %ecx
-; CHECK-NEXT: movl %ecx, %edx
-; CHECK-NEXT: andl $1048575, %edx # imm = 0xFFFFF
-; CHECK-NEXT: movl %edi, %r8d
-; CHECK-NEXT: andl $1048575, %r8d # imm = 0xFFFFF
-; CHECK-NEXT: cmpl %r8d, %edx
-; CHECK-NEXT: movl %ecx, %r8d
-; CHECK-NEXT: adcl %eax, %r8d
-; CHECK-NEXT: addl %eax, %ecx
-; CHECK-NEXT: andl $1048575, %ecx # imm = 0xFFFFF
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: shldq $4, %rdi, %rax
-; CHECK-NEXT: cmpl %edx, %ecx
-; CHECK-NEXT: movl %r8d, %ecx
-; CHECK-NEXT: adcl %eax, %ecx
-; CHECK-NEXT: movl %r8d, %edx
-; CHECK-NEXT: andl $1048575, %edx # imm = 0xFFFFF
-; CHECK-NEXT: addl %eax, %r8d
-; CHECK-NEXT: andl $1048575, %r8d # imm = 0xFFFFF
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: shrq $16, %rax
-; CHECK-NEXT: cmpl %edx, %r8d
-; CHECK-NEXT: movl %ecx, %edx
-; CHECK-NEXT: adcl %eax, %edx
-; CHECK-NEXT: movl %ecx, %r8d
-; CHECK-NEXT: andl $1048575, %r8d # imm = 0xFFFFF
-; CHECK-NEXT: addl %eax, %ecx
-; CHECK-NEXT: andl $1048575, %ecx # imm = 0xFFFFF
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: shrq $36, %rax
-; CHECK-NEXT: cmpl %r8d, %ecx
-; CHECK-NEXT: movl %edx, %r8d
-; CHECK-NEXT: adcl %eax, %r8d
-; CHECK-NEXT: movl %edx, %r9d
-; CHECK-NEXT: andl $1048575, %r9d # imm = 0xFFFFF
-; CHECK-NEXT: addl %eax, %edx
-; CHECK-NEXT: andl $1048575, %edx # imm = 0xFFFFF
+; CHECK-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
+; CHECK-NEXT: movq %rdi, %rdx
+; CHECK-NEXT: andq %rax, %rdx
+; CHECK-NEXT: movq %rsi, %r8
+; CHECK-NEXT: shldq $4, %rdi, %r8
+; CHECK-NEXT: addq %rdi, %r8
+; CHECK-NEXT: movq %r8, %r9
+; CHECK-NEXT: andq %rax, %r9
; CHECK-NEXT: movq %rsi, %rcx
; CHECK-NEXT: shrq $56, %rcx
-; CHECK-NEXT: cmpl %r9d, %edx
-; CHECK-NEXT: adcl %r8d, %ecx
-; CHECK-NEXT: andl $1048575, %ecx # imm = 0xFFFFF
+; CHECK-NEXT: cmpq %rdx, %r9
+; CHECK-NEXT: adcq %r8, %rcx
+; CHECK-NEXT: andq %rax, %rcx
; CHECK-NEXT: movabsq $737869762948382065, %rdx # imm = 0xA3D70A3D70A3D71
; CHECK-NEXT: movq %rcx, %rax
; CHECK-NEXT: mulq %rdx
>From 4cf19c84368365f773393bb8b9ee6ffa492509a8 Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Sun, 1 Mar 2026 19:59:50 +0530
Subject: [PATCH 03/15] Minor style fix
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 25 ++++++++-----------
1 file changed, 11 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 284f680612341..c40e4d9e89270 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8243,9 +8243,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
// Determine the largest legal scalar integer type we can safely use
// for chunk operations.
- unsigned MaxChunk = 0;
-
- // Use the largest legal integer register type for this VT.
+ unsigned MaxChunk;
EVT LegalVT = EVT(getRegisterType(*DAG.getContext(), VT));
if (LegalVT.isInteger())
MaxChunk = LegalVT.getSizeInBits();
@@ -8268,16 +8266,18 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
}
// If we found a good chunk width, slice the number and sum the pieces.
- if (BestChunkWidth > 0) {
- EVT ChunkVT = EVT::getIntegerVT(*DAG.getContext(), BestChunkWidth);
+ if (!BestChunkWidth)
+ return false;
- SDValue In;
+ EVT ChunkVT = EVT::getIntegerVT(*DAG.getContext(), BestChunkWidth);
- if (LL) {
- In = DAG.getNode(ISD::BUILD_PAIR, dl, VT, LL, LH);
- } else {
- In = N->getOperand(0);
- }
+ SDValue In;
+
+ if (LL) {
+ In = DAG.getNode(ISD::BUILD_PAIR, dl, VT, LL, LH);
+ } else {
+ In = N->getOperand(0);
+ }
SmallVector<SDValue, 8> Parts;
// Split into fixed-size chunks
@@ -8312,9 +8312,6 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
}
Sum = DAG.getNode(ISD::ZERO_EXTEND, dl, HiLoVT, Sum);
- } else {
- return false;
- }
}
// If we didn't find a sum, we can't do the expansion.
>From 82903fd0bcdbdd9170cf741401777315f094fbe5 Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Sun, 1 Mar 2026 20:08:44 +0530
Subject: [PATCH 04/15] Some minor formatting
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 62 +++++++++----------
1 file changed, 31 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index c40e4d9e89270..17e4a228b7a66 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8279,39 +8279,39 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
In = N->getOperand(0);
}
- SmallVector<SDValue, 8> Parts;
- // Split into fixed-size chunks
- for (unsigned i = 0; i < BitWidth; i += BestChunkWidth) {
- SDValue Shift = DAG.getShiftAmountConstant(i, VT, dl);
- SDValue Chunk = DAG.getNode(ISD::SRL, dl, VT, In, Shift);
- Chunk = DAG.getNode(ISD::TRUNCATE, dl, ChunkVT, Chunk);
- Parts.push_back(Chunk);
- }
- if (Parts.empty())
- return false;
- Sum = Parts[0];
-
-      // Use uaddo_carry if we can; otherwise use a compare to detect
-      // overflow, mirroring the half-split logic above.
- SDValue Carry = DAG.getConstant(0, dl, ChunkVT);
- EVT SetCCType =
- getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ChunkVT);
- for (unsigned i = 1; i < Parts.size(); ++i) {
- if (isOperationLegalOrCustom(ISD::UADDO_CARRY, ChunkVT)) {
- SDVTList VTList = DAG.getVTList(ChunkVT, SetCCType);
- SDValue UAdd = DAG.getNode(ISD::UADDO, dl, VTList, Sum, Parts[i]);
- Sum = DAG.getNode(ISD::UADDO_CARRY, dl, VTList, UAdd, Carry,
- UAdd.getValue(1));
- } else {
- SDValue Add = DAG.getNode(ISD::ADD, dl, ChunkVT, Sum, Parts[i]);
- SDValue NewCarry = DAG.getSetCC(dl, SetCCType, Add, Sum, ISD::SETULT);
- NewCarry = DAG.getZExtOrTrunc(NewCarry, dl, ChunkVT);
- Sum = DAG.getNode(ISD::ADD, dl, ChunkVT, Add, Carry);
- Carry = NewCarry;
- }
+ SmallVector<SDValue, 8> Parts;
+ // Split into fixed-size chunks
+ for (unsigned i = 0; i < BitWidth; i += BestChunkWidth) {
+ SDValue Shift = DAG.getShiftAmountConstant(i, VT, dl);
+ SDValue Chunk = DAG.getNode(ISD::SRL, dl, VT, In, Shift);
+ Chunk = DAG.getNode(ISD::TRUNCATE, dl, ChunkVT, Chunk);
+ Parts.push_back(Chunk);
+ }
+ if (Parts.empty())
+ return false;
+ Sum = Parts[0];
+
+    // Use uaddo_carry if we can; otherwise use a compare to detect
+    // overflow, mirroring the half-split logic above.
+ SDValue Carry = DAG.getConstant(0, dl, ChunkVT);
+ EVT SetCCType =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ChunkVT);
+ for (unsigned i = 1; i < Parts.size(); ++i) {
+ if (isOperationLegalOrCustom(ISD::UADDO_CARRY, ChunkVT)) {
+ SDVTList VTList = DAG.getVTList(ChunkVT, SetCCType);
+ SDValue UAdd = DAG.getNode(ISD::UADDO, dl, VTList, Sum, Parts[i]);
+ Sum = DAG.getNode(ISD::UADDO_CARRY, dl, VTList, UAdd, Carry,
+ UAdd.getValue(1));
+ } else {
+ SDValue Add = DAG.getNode(ISD::ADD, dl, ChunkVT, Sum, Parts[i]);
+ SDValue NewCarry = DAG.getSetCC(dl, SetCCType, Add, Sum, ISD::SETULT);
+ NewCarry = DAG.getZExtOrTrunc(NewCarry, dl, ChunkVT);
+ Sum = DAG.getNode(ISD::ADD, dl, ChunkVT, Add, Carry);
+ Carry = NewCarry;
}
+ }
- Sum = DAG.getNode(ISD::ZERO_EXTEND, dl, HiLoVT, Sum);
+ Sum = DAG.getNode(ISD::ZERO_EXTEND, dl, HiLoVT, Sum);
}
// If we didn't find a sum, we can't do the expansion.
>From 8fa320e766af56615cf40f79e72f8bbf017258f8 Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Sun, 1 Mar 2026 20:42:25 +0530
Subject: [PATCH 05/15] Fix failing RISC-V test cases
---
.../CodeGen/RISCV/split-udiv-by-constant.ll | 145 ++++++++----------
.../CodeGen/RISCV/split-urem-by-constant.ll | 73 +++------
llvm/test/CodeGen/RISCV/urem-vector-lkk.ll | 73 +++++----
3 files changed, 125 insertions(+), 166 deletions(-)
diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
index 8250fc3a176e2..b151370a15edc 100644
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -153,53 +153,41 @@ define iXLen2 @test_udiv_7(iXLen2 %x) nounwind {
;
; RV64-LABEL: test_udiv_7:
; RV64: # %bb.0:
-; RV64-NEXT: slli a2, a1, 4
-; RV64-NEXT: srli a3, a0, 60
-; RV64-NEXT: slli a4, a1, 34
-; RV64-NEXT: srli a5, a0, 30
-; RV64-NEXT: lui a6, 262144
-; RV64-NEXT: srli a7, a1, 26
-; RV64-NEXT: or a2, a3, a2
-; RV64-NEXT: lui a3, 748983
-; RV64-NEXT: or a4, a5, a4
-; RV64-NEXT: addi a6, a6, -1
-; RV64-NEXT: addi a3, a3, -585
-; RV64-NEXT: add a4, a0, a4
-; RV64-NEXT: slli a5, a3, 33
-; RV64-NEXT: add a3, a3, a5
-; RV64-NEXT: and a5, a0, a6
-; RV64-NEXT: add a2, a4, a2
-; RV64-NEXT: and a4, a4, a6
-; RV64-NEXT: sltu a5, a4, a5
-; RV64-NEXT: add a5, a2, a5
-; RV64-NEXT: and a2, a2, a6
-; RV64-NEXT: sltu a2, a2, a4
-; RV64-NEXT: srli a4, a1, 56
-; RV64-NEXT: add a2, a2, a4
-; RV64-NEXT: lui a4, %hi(.LCPI2_0)
-; RV64-NEXT: add a7, a5, a7
-; RV64-NEXT: and a5, a5, a6
-; RV64-NEXT: add a2, a7, a2
-; RV64-NEXT: and a7, a7, a6
-; RV64-NEXT: sltu a5, a7, a5
-; RV64-NEXT: lui a7, %hi(.LCPI2_1)
-; RV64-NEXT: ld a4, %lo(.LCPI2_0)(a4)
-; RV64-NEXT: ld a7, %lo(.LCPI2_1)(a7)
-; RV64-NEXT: add a2, a2, a5
-; RV64-NEXT: and a2, a2, a6
-; RV64-NEXT: mulhu a4, a2, a4
-; RV64-NEXT: slli a5, a4, 3
-; RV64-NEXT: add a2, a2, a4
-; RV64-NEXT: sub a2, a2, a5
-; RV64-NEXT: sub a4, a0, a2
+; RV64-NEXT: li a2, -1
+; RV64-NEXT: slli a3, a1, 1
+; RV64-NEXT: srli a4, a0, 63
+; RV64-NEXT: srli a5, a1, 62
+; RV64-NEXT: lui a6, 748983
+; RV64-NEXT: srli a2, a2, 1
+; RV64-NEXT: or a3, a4, a3
+; RV64-NEXT: addi a4, a6, -585
+; RV64-NEXT: slli a6, a4, 33
+; RV64-NEXT: add a4, a4, a6
+; RV64-NEXT: and a6, a0, a2
+; RV64-NEXT: add a3, a0, a3
+; RV64-NEXT: add a5, a3, a5
+; RV64-NEXT: and a3, a3, a2
+; RV64-NEXT: sltu a3, a3, a6
+; RV64-NEXT: lui a6, %hi(.LCPI2_0)
+; RV64-NEXT: ld a6, %lo(.LCPI2_0)(a6)
+; RV64-NEXT: add a3, a5, a3
+; RV64-NEXT: lui a5, %hi(.LCPI2_1)
+; RV64-NEXT: ld a5, %lo(.LCPI2_1)(a5)
+; RV64-NEXT: and a2, a3, a2
+; RV64-NEXT: mulhu a3, a2, a6
+; RV64-NEXT: srli a3, a3, 1
+; RV64-NEXT: slli a6, a3, 3
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: sub a2, a2, a6
+; RV64-NEXT: sub a3, a0, a2
; RV64-NEXT: sltu a0, a0, a2
-; RV64-NEXT: mul a2, a4, a7
-; RV64-NEXT: mulhu a5, a4, a3
+; RV64-NEXT: mul a2, a3, a5
+; RV64-NEXT: mulhu a5, a3, a4
; RV64-NEXT: sub a1, a1, a0
; RV64-NEXT: add a2, a5, a2
-; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: mul a1, a1, a4
; RV64-NEXT: add a1, a2, a1
-; RV64-NEXT: mul a0, a4, a3
+; RV64-NEXT: mul a0, a3, a4
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 7
ret iXLen2 %a
@@ -243,51 +231,38 @@ define iXLen2 @test_udiv_9(iXLen2 %x) nounwind {
;
; RV64-LABEL: test_udiv_9:
; RV64: # %bb.0:
-; RV64-NEXT: slli a2, a1, 4
-; RV64-NEXT: srli a3, a0, 60
-; RV64-NEXT: slli a4, a1, 34
-; RV64-NEXT: srli a5, a0, 30
-; RV64-NEXT: lui a6, 262144
-; RV64-NEXT: srli a7, a1, 26
-; RV64-NEXT: or a2, a3, a2
-; RV64-NEXT: srli a3, a1, 56
-; RV64-NEXT: or a4, a5, a4
-; RV64-NEXT: addi a6, a6, -1
-; RV64-NEXT: add a4, a0, a4
-; RV64-NEXT: and a5, a0, a6
-; RV64-NEXT: add a2, a4, a2
-; RV64-NEXT: and a4, a4, a6
-; RV64-NEXT: sltu a5, a4, a5
-; RV64-NEXT: add a5, a2, a5
-; RV64-NEXT: and a2, a2, a6
-; RV64-NEXT: sltu a2, a2, a4
-; RV64-NEXT: lui a4, %hi(.LCPI3_0)
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, %hi(.LCPI3_1)
-; RV64-NEXT: add a7, a5, a7
-; RV64-NEXT: and a5, a5, a6
-; RV64-NEXT: add a2, a7, a2
-; RV64-NEXT: and a7, a7, a6
-; RV64-NEXT: sltu a5, a7, a5
-; RV64-NEXT: lui a7, %hi(.LCPI3_2)
-; RV64-NEXT: ld a4, %lo(.LCPI3_0)(a4)
-; RV64-NEXT: ld a3, %lo(.LCPI3_1)(a3)
-; RV64-NEXT: ld a7, %lo(.LCPI3_2)(a7)
-; RV64-NEXT: add a2, a2, a5
-; RV64-NEXT: and a2, a2, a6
-; RV64-NEXT: mulhu a4, a2, a4
-; RV64-NEXT: slli a5, a4, 3
-; RV64-NEXT: sub a2, a2, a4
-; RV64-NEXT: sub a2, a2, a5
-; RV64-NEXT: sub a4, a0, a2
+; RV64-NEXT: li a2, -1
+; RV64-NEXT: slli a3, a1, 4
+; RV64-NEXT: srli a4, a0, 60
+; RV64-NEXT: srli a5, a1, 56
+; RV64-NEXT: lui a6, %hi(.LCPI3_0)
+; RV64-NEXT: srli a2, a2, 4
+; RV64-NEXT: or a3, a4, a3
+; RV64-NEXT: and a4, a0, a2
+; RV64-NEXT: add a3, a0, a3
+; RV64-NEXT: add a5, a3, a5
+; RV64-NEXT: and a3, a3, a2
+; RV64-NEXT: sltu a3, a3, a4
+; RV64-NEXT: lui a4, %hi(.LCPI3_1)
+; RV64-NEXT: add a3, a5, a3
+; RV64-NEXT: lui a5, %hi(.LCPI3_2)
+; RV64-NEXT: ld a6, %lo(.LCPI3_0)(a6)
+; RV64-NEXT: ld a4, %lo(.LCPI3_1)(a4)
+; RV64-NEXT: ld a5, %lo(.LCPI3_2)(a5)
+; RV64-NEXT: and a2, a3, a2
+; RV64-NEXT: mulhu a3, a2, a6
+; RV64-NEXT: slli a6, a3, 3
+; RV64-NEXT: sub a2, a2, a3
+; RV64-NEXT: sub a2, a2, a6
+; RV64-NEXT: sub a3, a0, a2
; RV64-NEXT: sltu a0, a0, a2
-; RV64-NEXT: mul a2, a4, a3
-; RV64-NEXT: mulhu a3, a4, a7
+; RV64-NEXT: mul a2, a3, a4
+; RV64-NEXT: mulhu a4, a3, a5
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: add a2, a3, a2
-; RV64-NEXT: mul a1, a1, a7
+; RV64-NEXT: add a2, a4, a2
+; RV64-NEXT: mul a1, a1, a5
; RV64-NEXT: add a1, a2, a1
-; RV64-NEXT: mul a0, a4, a7
+; RV64-NEXT: mul a0, a3, a5
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 9
ret iXLen2 %a
diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
index 1680ea7d8da30..4b0c41861664a 100644
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -103,35 +103,23 @@ define iXLen2 @test_urem_7(iXLen2 %x) nounwind {
;
; RV64-LABEL: test_urem_7:
; RV64: # %bb.0:
-; RV64-NEXT: slli a2, a1, 4
-; RV64-NEXT: srli a3, a0, 60
-; RV64-NEXT: slli a4, a1, 34
-; RV64-NEXT: srli a5, a0, 30
-; RV64-NEXT: lui a6, 262144
-; RV64-NEXT: or a2, a3, a2
-; RV64-NEXT: srli a3, a1, 26
-; RV64-NEXT: srli a1, a1, 56
-; RV64-NEXT: or a4, a5, a4
+; RV64-NEXT: li a2, -1
+; RV64-NEXT: slli a3, a1, 1
+; RV64-NEXT: srli a4, a0, 63
+; RV64-NEXT: srli a1, a1, 62
; RV64-NEXT: lui a5, %hi(.LCPI2_0)
-; RV64-NEXT: addi a6, a6, -1
-; RV64-NEXT: ld a5, %lo(.LCPI2_0)(a5)
-; RV64-NEXT: add a4, a0, a4
-; RV64-NEXT: and a0, a0, a6
-; RV64-NEXT: add a2, a4, a2
-; RV64-NEXT: and a4, a4, a6
-; RV64-NEXT: sltu a0, a4, a0
-; RV64-NEXT: add a0, a2, a0
-; RV64-NEXT: and a2, a2, a6
-; RV64-NEXT: sltu a2, a2, a4
-; RV64-NEXT: and a4, a0, a6
+; RV64-NEXT: srli a2, a2, 1
+; RV64-NEXT: or a3, a4, a3
+; RV64-NEXT: ld a4, %lo(.LCPI2_0)(a5)
+; RV64-NEXT: and a5, a0, a2
; RV64-NEXT: add a0, a0, a3
-; RV64-NEXT: add a1, a2, a1
-; RV64-NEXT: and a2, a0, a6
+; RV64-NEXT: and a3, a0, a2
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: sltu a1, a2, a4
+; RV64-NEXT: sltu a1, a3, a5
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: and a0, a0, a6
-; RV64-NEXT: mulhu a1, a0, a5
+; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: mulhu a1, a0, a4
+; RV64-NEXT: srli a1, a1, 1
; RV64-NEXT: slli a2, a1, 3
; RV64-NEXT: add a0, a0, a1
; RV64-NEXT: sub a0, a0, a2
@@ -169,35 +157,22 @@ define iXLen2 @test_urem_9(iXLen2 %x) nounwind {
;
; RV64-LABEL: test_urem_9:
; RV64: # %bb.0:
-; RV64-NEXT: slli a2, a1, 4
-; RV64-NEXT: srli a3, a0, 60
-; RV64-NEXT: slli a4, a1, 34
-; RV64-NEXT: srli a5, a0, 30
-; RV64-NEXT: lui a6, 262144
-; RV64-NEXT: or a2, a3, a2
-; RV64-NEXT: srli a3, a1, 26
+; RV64-NEXT: li a2, -1
+; RV64-NEXT: slli a3, a1, 4
+; RV64-NEXT: srli a4, a0, 60
; RV64-NEXT: srli a1, a1, 56
-; RV64-NEXT: or a4, a5, a4
; RV64-NEXT: lui a5, %hi(.LCPI3_0)
-; RV64-NEXT: addi a6, a6, -1
-; RV64-NEXT: ld a5, %lo(.LCPI3_0)(a5)
-; RV64-NEXT: add a4, a0, a4
-; RV64-NEXT: and a0, a0, a6
-; RV64-NEXT: add a2, a4, a2
-; RV64-NEXT: and a4, a4, a6
-; RV64-NEXT: sltu a0, a4, a0
-; RV64-NEXT: add a0, a2, a0
-; RV64-NEXT: and a2, a2, a6
-; RV64-NEXT: sltu a2, a2, a4
-; RV64-NEXT: and a4, a0, a6
+; RV64-NEXT: srli a2, a2, 4
+; RV64-NEXT: or a3, a4, a3
+; RV64-NEXT: ld a4, %lo(.LCPI3_0)(a5)
+; RV64-NEXT: and a5, a0, a2
; RV64-NEXT: add a0, a0, a3
-; RV64-NEXT: add a1, a2, a1
-; RV64-NEXT: and a2, a0, a6
+; RV64-NEXT: and a3, a0, a2
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: sltu a1, a2, a4
+; RV64-NEXT: sltu a1, a3, a5
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: and a0, a0, a6
-; RV64-NEXT: mulhu a1, a0, a5
+; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: mulhu a1, a0, a4
; RV64-NEXT: slli a2, a1, 3
; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: sub a0, a0, a2
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index 7fb5ba5f7fc63..e2f2c00c7818b 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -862,51 +862,61 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV32IM-NEXT: sw s5, 20(sp) # 4-byte Folded Spill
; RV32IM-NEXT: sw s6, 16(sp) # 4-byte Folded Spill
; RV32IM-NEXT: sw s7, 12(sp) # 4-byte Folded Spill
-; RV32IM-NEXT: sw s8, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT: lw s1, 16(a1)
-; RV32IM-NEXT: lw s2, 20(a1)
-; RV32IM-NEXT: lw s3, 24(a1)
-; RV32IM-NEXT: lw s4, 28(a1)
-; RV32IM-NEXT: lw a3, 0(a1)
-; RV32IM-NEXT: lw a4, 4(a1)
-; RV32IM-NEXT: lw s5, 8(a1)
-; RV32IM-NEXT: lw s6, 12(a1)
; RV32IM-NEXT: mv s0, a0
+; RV32IM-NEXT: lw a2, 16(a1)
+; RV32IM-NEXT: lw a4, 20(a1)
+; RV32IM-NEXT: lw s1, 24(a1)
+; RV32IM-NEXT: lw s2, 28(a1)
+; RV32IM-NEXT: lw a0, 0(a1)
+; RV32IM-NEXT: lw a3, 4(a1)
+; RV32IM-NEXT: lw s3, 8(a1)
+; RV32IM-NEXT: lw s4, 12(a1)
+; RV32IM-NEXT: lui a1, 1024
+; RV32IM-NEXT: lui a5, 45590
+; RV32IM-NEXT: addi a1, a1, -1
+; RV32IM-NEXT: addi a5, a5, 1069
+; RV32IM-NEXT: slli a6, a4, 10
+; RV32IM-NEXT: srli a7, a2, 22
+; RV32IM-NEXT: or a6, a7, a6
+; RV32IM-NEXT: and a7, a2, a1
+; RV32IM-NEXT: srli a4, a4, 12
+; RV32IM-NEXT: add a2, a2, a6
+; RV32IM-NEXT: and a6, a2, a1
+; RV32IM-NEXT: add a2, a2, a4
+; RV32IM-NEXT: sltu a4, a6, a7
+; RV32IM-NEXT: add a2, a2, a4
+; RV32IM-NEXT: and a1, a2, a1
+; RV32IM-NEXT: mulhu a2, a1, a5
+; RV32IM-NEXT: li a4, 23
+; RV32IM-NEXT: mul a2, a2, a4
+; RV32IM-NEXT: sub s7, a1, a2
; RV32IM-NEXT: li a2, 1
-; RV32IM-NEXT: mv a0, a3
-; RV32IM-NEXT: mv a1, a4
-; RV32IM-NEXT: li a3, 0
-; RV32IM-NEXT: call __umoddi3
-; RV32IM-NEXT: mv s7, a0
-; RV32IM-NEXT: mv s8, a1
-; RV32IM-NEXT: li a2, 654
-; RV32IM-NEXT: mv a0, s5
-; RV32IM-NEXT: mv a1, s6
+; RV32IM-NEXT: mv a1, a3
; RV32IM-NEXT: li a3, 0
; RV32IM-NEXT: call __umoddi3
; RV32IM-NEXT: mv s5, a0
; RV32IM-NEXT: mv s6, a1
-; RV32IM-NEXT: li a2, 23
-; RV32IM-NEXT: mv a0, s1
-; RV32IM-NEXT: mv a1, s2
+; RV32IM-NEXT: li a2, 654
+; RV32IM-NEXT: mv a0, s3
+; RV32IM-NEXT: mv a1, s4
; RV32IM-NEXT: li a3, 0
; RV32IM-NEXT: call __umoddi3
-; RV32IM-NEXT: mv s1, a0
-; RV32IM-NEXT: mv s2, a1
+; RV32IM-NEXT: mv s3, a0
+; RV32IM-NEXT: mv s4, a1
; RV32IM-NEXT: lui a2, 1
; RV32IM-NEXT: addi a2, a2, 1327
-; RV32IM-NEXT: mv a0, s3
-; RV32IM-NEXT: mv a1, s4
+; RV32IM-NEXT: mv a0, s1
+; RV32IM-NEXT: mv a1, s2
; RV32IM-NEXT: li a3, 0
; RV32IM-NEXT: call __umoddi3
-; RV32IM-NEXT: sw s1, 16(s0)
-; RV32IM-NEXT: sw s2, 20(s0)
+; RV32IM-NEXT: sw s7, 16(s0)
+; RV32IM-NEXT: sw zero, 20(s0)
; RV32IM-NEXT: sw a0, 24(s0)
; RV32IM-NEXT: sw a1, 28(s0)
-; RV32IM-NEXT: sw s7, 0(s0)
-; RV32IM-NEXT: sw s8, 4(s0)
-; RV32IM-NEXT: sw s5, 8(s0)
-; RV32IM-NEXT: sw s6, 12(s0)
+; RV32IM-NEXT: sw s5, 0(s0)
+; RV32IM-NEXT: sw s6, 4(s0)
+; RV32IM-NEXT: sw s3, 8(s0)
+; RV32IM-NEXT: sw s4, 12(s0)
; RV32IM-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
; RV32IM-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
; RV32IM-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
@@ -916,7 +926,6 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV32IM-NEXT: lw s5, 20(sp) # 4-byte Folded Reload
; RV32IM-NEXT: lw s6, 16(sp) # 4-byte Folded Reload
; RV32IM-NEXT: lw s7, 12(sp) # 4-byte Folded Reload
-; RV32IM-NEXT: lw s8, 8(sp) # 4-byte Folded Reload
; RV32IM-NEXT: addi sp, sp, 48
; RV32IM-NEXT: ret
;
>From 8ba7f05bb2b253bd30d8de968665c09d65464642 Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Mon, 2 Mar 2026 18:11:35 +0530
Subject: [PATCH 06/15] Address review comments
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 9 +-
llvm/test/CodeGen/X86/i128-divrem-by-const.ll | 391 ++++++++++++++++++
llvm/test/CodeGen/X86/uint128-div-const.ll | 148 -------
3 files changed, 393 insertions(+), 155 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/i128-divrem-by-const.ll
delete mode 100644 llvm/test/CodeGen/X86/uint128-div-const.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 17e4a228b7a66..45d79fd8cecf0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8271,13 +8271,8 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
EVT ChunkVT = EVT::getIntegerVT(*DAG.getContext(), BestChunkWidth);
- SDValue In;
-
- if (LL) {
- In = DAG.getNode(ISD::BUILD_PAIR, dl, VT, LL, LH);
- } else {
- In = N->getOperand(0);
- }
+ SDValue In =
+ LL ? DAG.getNode(ISD::BUILD_PAIR, dl, VT, LL, LH) : N->getOperand(0);
SmallVector<SDValue, 8> Parts;
// Split into fixed-size chunks
diff --git a/llvm/test/CodeGen/X86/i128-divrem-by-const.ll b/llvm/test/CodeGen/X86/i128-divrem-by-const.ll
new file mode 100644
index 0000000000000..ccf80e4892711
--- /dev/null
+++ b/llvm/test/CodeGen/X86/i128-divrem-by-const.ll
@@ -0,0 +1,391 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define i128 @div_by_7(i128 %x) {
+; CHECK-LABEL: div_by_7:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT: movq %rdi, %rdx
+; CHECK-NEXT: andq %rax, %rdx
+; CHECK-NEXT: movq %rsi, %r8
+; CHECK-NEXT: shldq $1, %rdi, %r8
+; CHECK-NEXT: addq %rdi, %r8
+; CHECK-NEXT: movq %r8, %r9
+; CHECK-NEXT: andq %rax, %r9
+; CHECK-NEXT: movq %rsi, %rcx
+; CHECK-NEXT: shrq $62, %rcx
+; CHECK-NEXT: cmpq %rdx, %r9
+; CHECK-NEXT: adcq %r8, %rcx
+; CHECK-NEXT: andq %rax, %rcx
+; CHECK-NEXT: movabsq $5270498306774157605, %rdx # imm = 0x4924924924924925
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: mulq %rdx
+; CHECK-NEXT: shrq %rdx
+; CHECK-NEXT: leaq (,%rdx,8), %rax
+; CHECK-NEXT: subq %rax, %rdx
+; CHECK-NEXT: addq %rcx, %rdx
+; CHECK-NEXT: subq %rdx, %rdi
+; CHECK-NEXT: sbbq $0, %rsi
+; CHECK-NEXT: movabsq $-5270498306774157605, %rcx # imm = 0xB6DB6DB6DB6DB6DB
+; CHECK-NEXT: imulq %rdi, %rcx
+; CHECK-NEXT: movabsq $7905747460161236407, %r8 # imm = 0x6DB6DB6DB6DB6DB7
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: mulq %r8
+; CHECK-NEXT: addq %rcx, %rdx
+; CHECK-NEXT: imulq %rsi, %r8
+; CHECK-NEXT: addq %r8, %rdx
+; CHECK-NEXT: retq
+entry:
+ %div = udiv i128 %x, 7
+ ret i128 %div
+}
+
+define i128 @div_by_9(i128 %x) {
+; CHECK-LABEL: div_by_9:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
+; CHECK-NEXT: movq %rdi, %rdx
+; CHECK-NEXT: andq %rax, %rdx
+; CHECK-NEXT: movq %rsi, %r8
+; CHECK-NEXT: shldq $4, %rdi, %r8
+; CHECK-NEXT: addq %rdi, %r8
+; CHECK-NEXT: movq %r8, %r9
+; CHECK-NEXT: andq %rax, %r9
+; CHECK-NEXT: movq %rsi, %rcx
+; CHECK-NEXT: shrq $56, %rcx
+; CHECK-NEXT: cmpq %rdx, %r9
+; CHECK-NEXT: adcq %r8, %rcx
+; CHECK-NEXT: andq %rax, %rcx
+; CHECK-NEXT: movabsq $2049638230412172402, %rdx # imm = 0x1C71C71C71C71C72
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: mulq %rdx
+; CHECK-NEXT: leaq (%rdx,%rdx,8), %rax
+; CHECK-NEXT: subq %rax, %rcx
+; CHECK-NEXT: subq %rcx, %rdi
+; CHECK-NEXT: sbbq $0, %rsi
+; CHECK-NEXT: movabsq $4099276460824344803, %rcx # imm = 0x38E38E38E38E38E3
+; CHECK-NEXT: imulq %rdi, %rcx
+; CHECK-NEXT: movabsq $-8198552921648689607, %r8 # imm = 0x8E38E38E38E38E39
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: mulq %r8
+; CHECK-NEXT: addq %rcx, %rdx
+; CHECK-NEXT: imulq %rsi, %r8
+; CHECK-NEXT: addq %r8, %rdx
+; CHECK-NEXT: retq
+entry:
+ %div = udiv i128 %x, 9
+ ret i128 %div
+}
+
+define i128 @div_by_11(i128 %x) {
+; CHECK-LABEL: div_by_11:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
+; CHECK-NEXT: movq %rdi, %rdx
+; CHECK-NEXT: andq %rax, %rdx
+; CHECK-NEXT: movq %rsi, %r8
+; CHECK-NEXT: shldq $4, %rdi, %r8
+; CHECK-NEXT: addq %rdi, %r8
+; CHECK-NEXT: movq %r8, %r9
+; CHECK-NEXT: andq %rax, %r9
+; CHECK-NEXT: movq %rsi, %rcx
+; CHECK-NEXT: shrq $56, %rcx
+; CHECK-NEXT: cmpq %rdx, %r9
+; CHECK-NEXT: adcq %r8, %rcx
+; CHECK-NEXT: andq %rax, %rcx
+; CHECK-NEXT: movabsq $1676976733973595602, %rdx # imm = 0x1745D1745D1745D2
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: mulq %rdx
+; CHECK-NEXT: leaq (%rdx,%rdx,4), %rax
+; CHECK-NEXT: leaq (%rdx,%rax,2), %rax
+; CHECK-NEXT: subq %rax, %rcx
+; CHECK-NEXT: subq %rcx, %rdi
+; CHECK-NEXT: sbbq $0, %rsi
+; CHECK-NEXT: movabsq $-6707906935894382406, %rcx # imm = 0xA2E8BA2E8BA2E8BA
+; CHECK-NEXT: imulq %rdi, %rcx
+; CHECK-NEXT: movabsq $3353953467947191203, %r8 # imm = 0x2E8BA2E8BA2E8BA3
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: mulq %r8
+; CHECK-NEXT: addq %rcx, %rdx
+; CHECK-NEXT: imulq %rsi, %r8
+; CHECK-NEXT: addq %r8, %rdx
+; CHECK-NEXT: retq
+ %div = udiv i128 %x, 11
+ ret i128 %div
+}
+
+define i128 @div_by_22(i128 %x) {
+; CHECK-LABEL: div_by_22:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movl $22, %edx
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: callq __udivti3 at PLT
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+entry:
+ %div = udiv i128 %x, 22
+ ret i128 %div
+}
+
+define i128 @div_by_25(i128 %x) {
+; CHECK-LABEL: div_by_25:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
+; CHECK-NEXT: movq %rdi, %rdx
+; CHECK-NEXT: andq %rax, %rdx
+; CHECK-NEXT: movq %rsi, %r8
+; CHECK-NEXT: shldq $4, %rdi, %r8
+; CHECK-NEXT: addq %rdi, %r8
+; CHECK-NEXT: movq %r8, %r9
+; CHECK-NEXT: andq %rax, %r9
+; CHECK-NEXT: movq %rsi, %rcx
+; CHECK-NEXT: shrq $56, %rcx
+; CHECK-NEXT: cmpq %rdx, %r9
+; CHECK-NEXT: adcq %r8, %rcx
+; CHECK-NEXT: andq %rax, %rcx
+; CHECK-NEXT: movabsq $737869762948382065, %rdx # imm = 0xA3D70A3D70A3D71
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: mulq %rdx
+; CHECK-NEXT: leaq (%rdx,%rdx,4), %rax
+; CHECK-NEXT: leaq (%rax,%rax,4), %rax
+; CHECK-NEXT: subq %rax, %rcx
+; CHECK-NEXT: subq %rcx, %rdi
+; CHECK-NEXT: sbbq $0, %rsi
+; CHECK-NEXT: movabsq $2951479051793528258, %rcx # imm = 0x28F5C28F5C28F5C2
+; CHECK-NEXT: imulq %rdi, %rcx
+; CHECK-NEXT: movabsq $-8116567392432202711, %r8 # imm = 0x8F5C28F5C28F5C29
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: mulq %r8
+; CHECK-NEXT: addq %rcx, %rdx
+; CHECK-NEXT: imulq %rsi, %r8
+; CHECK-NEXT: addq %r8, %rdx
+; CHECK-NEXT: retq
+entry:
+ %div = udiv i128 %x, 25
+ ret i128 %div
+}
+
+define i128 @div_by_56(i128 %x) {
+; CHECK-LABEL: div_by_56:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movl $56, %edx
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: callq __udivti3 at PLT
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+ %div = udiv i128 %x, 56 ; 8 * 7
+ ret i128 %div
+}
+
+define i128 @rem_by_7(i128 %x) {
+; CHECK-LABEL: rem_by_7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT: movq %rsi, %rcx
+; CHECK-NEXT: shldq $1, %rdi, %rcx
+; CHECK-NEXT: addq %rdi, %rcx
+; CHECK-NEXT: andq %rax, %rdi
+; CHECK-NEXT: movq %rcx, %rdx
+; CHECK-NEXT: andq %rax, %rdx
+; CHECK-NEXT: shrq $62, %rsi
+; CHECK-NEXT: cmpq %rdi, %rdx
+; CHECK-NEXT: adcq %rsi, %rcx
+; CHECK-NEXT: andq %rax, %rcx
+; CHECK-NEXT: movabsq $5270498306774157605, %rdx # imm = 0x4924924924924925
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: mulq %rdx
+; CHECK-NEXT: shrq %rdx
+; CHECK-NEXT: leaq (,%rdx,8), %rax
+; CHECK-NEXT: subq %rax, %rdx
+; CHECK-NEXT: addq %rdx, %rcx
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: retq
+ %r = urem i128 %x, 7
+ ret i128 %r
+}
+
+define i128 @rem_by_14(i128 %x) {
+; CHECK-LABEL: rem_by_14:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movl $14, %edx
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: callq __umodti3 at PLT
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+ %r = urem i128 %x, 14
+ ret i128 %r
+}
+
+define <2 x i64> @v2i64_div_by_7(<2 x i64> %x) {
+; CHECK-LABEL: v2i64_div_by_7:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %xmm0, %rcx
+; CHECK-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: mulq %rsi
+; CHECK-NEXT: subq %rdx, %rcx
+; CHECK-NEXT: shrq %rcx
+; CHECK-NEXT: addq %rdx, %rcx
+; CHECK-NEXT: movq %rcx, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; CHECK-NEXT: movq %xmm0, %rcx
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: mulq %rsi
+; CHECK-NEXT: subq %rdx, %rcx
+; CHECK-NEXT: shrq %rcx
+; CHECK-NEXT: addq %rdx, %rcx
+; CHECK-NEXT: movq %rcx, %xmm0
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; CHECK-NEXT: psrlq $2, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %div = udiv <2 x i64> %x, <i64 7, i64 7>
+ ret <2 x i64> %div
+}
+
+define <2 x i64> @v2i64_div_by_14(<2 x i64> %x) {
+; CHECK-LABEL: v2i64_div_by_14:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %xmm0, %rax
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
+; CHECK-NEXT: mulq %rcx
+; CHECK-NEXT: movq %rdx, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; CHECK-NEXT: movq %xmm0, %rax
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: mulq %rcx
+; CHECK-NEXT: movq %rdx, %xmm0
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; CHECK-NEXT: psrlq $1, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %div = udiv <2 x i64> %x, <i64 14, i64 14>
+ ret <2 x i64> %div
+}
+
+define <4 x i32> @v4i32_div_by_7(<4 x i32> %x) {
+; CHECK-LABEL: v4i32_div_by_7:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-NEXT: psubd %xmm2, %xmm0
+; CHECK-NEXT: psrld $1, %xmm0
+; CHECK-NEXT: paddd %xmm2, %xmm0
+; CHECK-NEXT: psrld $2, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %div = udiv <4 x i32> %x, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %div
+}
+
+define <2 x i128> @v2i128_div_by_7(<2 x i128> %x) {
+; CHECK-LABEL: v2i128_div_by_7:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: .cfi_offset %rbx, -40
+; CHECK-NEXT: .cfi_offset %r12, -32
+; CHECK-NEXT: .cfi_offset %r14, -24
+; CHECK-NEXT: .cfi_offset %r15, -16
+; CHECK-NEXT: movq %rcx, %r9
+; CHECK-NEXT: movq %rdx, %rcx
+; CHECK-NEXT: movabsq $9223372036854775807, %r11 # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: andq %r11, %rax
+; CHECK-NEXT: shldq $1, %rsi, %rdx
+; CHECK-NEXT: addq %rsi, %rdx
+; CHECK-NEXT: movq %rdx, %rbx
+; CHECK-NEXT: andq %r11, %rbx
+; CHECK-NEXT: movq %rcx, %r10
+; CHECK-NEXT: shrq $62, %r10
+; CHECK-NEXT: cmpq %rax, %rbx
+; CHECK-NEXT: adcq %rdx, %r10
+; CHECK-NEXT: andq %r11, %r10
+; CHECK-NEXT: movabsq $5270498306774157605, %r15 # imm = 0x4924924924924925
+; CHECK-NEXT: movq %r10, %rax
+; CHECK-NEXT: mulq %r15
+; CHECK-NEXT: shrq %rdx
+; CHECK-NEXT: leaq (,%rdx,8), %rax
+; CHECK-NEXT: subq %rax, %rdx
+; CHECK-NEXT: addq %r10, %rdx
+; CHECK-NEXT: subq %rdx, %rsi
+; CHECK-NEXT: sbbq $0, %rcx
+; CHECK-NEXT: movabsq $-5270498306774157605, %rbx # imm = 0xB6DB6DB6DB6DB6DB
+; CHECK-NEXT: movq %rsi, %r10
+; CHECK-NEXT: imulq %rbx, %r10
+; CHECK-NEXT: movabsq $7905747460161236407, %r14 # imm = 0x6DB6DB6DB6DB6DB7
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: mulq %r14
+; CHECK-NEXT: movq %rax, %rsi
+; CHECK-NEXT: addq %r10, %rdx
+; CHECK-NEXT: imulq %r14, %rcx
+; CHECK-NEXT: addq %rdx, %rcx
+; CHECK-NEXT: movq %r9, %rax
+; CHECK-NEXT: andq %r11, %rax
+; CHECK-NEXT: movq %r8, %rdx
+; CHECK-NEXT: shldq $1, %r9, %rdx
+; CHECK-NEXT: addq %r9, %rdx
+; CHECK-NEXT: movq %rdx, %r12
+; CHECK-NEXT: andq %r11, %r12
+; CHECK-NEXT: movq %r8, %r10
+; CHECK-NEXT: shrq $62, %r10
+; CHECK-NEXT: cmpq %rax, %r12
+; CHECK-NEXT: adcq %rdx, %r10
+; CHECK-NEXT: andq %r11, %r10
+; CHECK-NEXT: movq %r10, %rax
+; CHECK-NEXT: mulq %r15
+; CHECK-NEXT: shrq %rdx
+; CHECK-NEXT: leaq (,%rdx,8), %rax
+; CHECK-NEXT: subq %rax, %rdx
+; CHECK-NEXT: addq %r10, %rdx
+; CHECK-NEXT: subq %rdx, %r9
+; CHECK-NEXT: sbbq $0, %r8
+; CHECK-NEXT: imulq %r9, %rbx
+; CHECK-NEXT: movq %r9, %rax
+; CHECK-NEXT: mulq %r14
+; CHECK-NEXT: addq %rbx, %rdx
+; CHECK-NEXT: imulq %r14, %r8
+; CHECK-NEXT: addq %rdx, %r8
+; CHECK-NEXT: movq %rax, 16(%rdi)
+; CHECK-NEXT: movq %rsi, (%rdi)
+; CHECK-NEXT: movq %r8, 24(%rdi)
+; CHECK-NEXT: movq %rcx, 8(%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+entry:
+ %div = udiv <2 x i128> %x, <i128 7, i128 7>
+ ret <2 x i128> %div
+}
diff --git a/llvm/test/CodeGen/X86/uint128-div-const.ll b/llvm/test/CodeGen/X86/uint128-div-const.ll
deleted file mode 100644
index 952b98af6adea..0000000000000
--- a/llvm/test/CodeGen/X86/uint128-div-const.ll
+++ /dev/null
@@ -1,148 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -O2 | FileCheck %s
-
-define i128 @div_by_7(i128 %x) {
-; CHECK-LABEL: div_by_7:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT: movq %rdi, %rdx
-; CHECK-NEXT: andq %rax, %rdx
-; CHECK-NEXT: movq %rsi, %r8
-; CHECK-NEXT: shldq $1, %rdi, %r8
-; CHECK-NEXT: addq %rdi, %r8
-; CHECK-NEXT: movq %r8, %r9
-; CHECK-NEXT: andq %rax, %r9
-; CHECK-NEXT: movq %rsi, %rcx
-; CHECK-NEXT: shrq $62, %rcx
-; CHECK-NEXT: cmpq %rdx, %r9
-; CHECK-NEXT: adcq %r8, %rcx
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: movabsq $5270498306774157605, %rdx # imm = 0x4924924924924925
-; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: mulq %rdx
-; CHECK-NEXT: shrq %rdx
-; CHECK-NEXT: leaq (,%rdx,8), %rax
-; CHECK-NEXT: subq %rax, %rdx
-; CHECK-NEXT: addq %rcx, %rdx
-; CHECK-NEXT: subq %rdx, %rdi
-; CHECK-NEXT: sbbq $0, %rsi
-; CHECK-NEXT: movabsq $-5270498306774157605, %rcx # imm = 0xB6DB6DB6DB6DB6DB
-; CHECK-NEXT: imulq %rdi, %rcx
-; CHECK-NEXT: movabsq $7905747460161236407, %r8 # imm = 0x6DB6DB6DB6DB6DB7
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: mulq %r8
-; CHECK-NEXT: addq %rcx, %rdx
-; CHECK-NEXT: imulq %rsi, %r8
-; CHECK-NEXT: addq %r8, %rdx
-; CHECK-NEXT: retq
-entry:
- %div = udiv i128 %x, 7
- ret i128 %div
-}
-
-define i128 @div_by_9(i128 %x) {
-; CHECK-LABEL: div_by_9:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
-; CHECK-NEXT: movq %rdi, %rdx
-; CHECK-NEXT: andq %rax, %rdx
-; CHECK-NEXT: movq %rsi, %r8
-; CHECK-NEXT: shldq $4, %rdi, %r8
-; CHECK-NEXT: addq %rdi, %r8
-; CHECK-NEXT: movq %r8, %r9
-; CHECK-NEXT: andq %rax, %r9
-; CHECK-NEXT: movq %rsi, %rcx
-; CHECK-NEXT: shrq $56, %rcx
-; CHECK-NEXT: cmpq %rdx, %r9
-; CHECK-NEXT: adcq %r8, %rcx
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: movabsq $2049638230412172402, %rdx # imm = 0x1C71C71C71C71C72
-; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: mulq %rdx
-; CHECK-NEXT: leaq (%rdx,%rdx,8), %rax
-; CHECK-NEXT: subq %rax, %rcx
-; CHECK-NEXT: subq %rcx, %rdi
-; CHECK-NEXT: sbbq $0, %rsi
-; CHECK-NEXT: movabsq $4099276460824344803, %rcx # imm = 0x38E38E38E38E38E3
-; CHECK-NEXT: imulq %rdi, %rcx
-; CHECK-NEXT: movabsq $-8198552921648689607, %r8 # imm = 0x8E38E38E38E38E39
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: mulq %r8
-; CHECK-NEXT: addq %rcx, %rdx
-; CHECK-NEXT: imulq %rsi, %r8
-; CHECK-NEXT: addq %r8, %rdx
-; CHECK-NEXT: retq
-entry:
- %div = udiv i128 %x, 9
- ret i128 %div
-}
-
-define i128 @div_by_25(i128 %x) {
-; CHECK-LABEL: div_by_25:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
-; CHECK-NEXT: movq %rdi, %rdx
-; CHECK-NEXT: andq %rax, %rdx
-; CHECK-NEXT: movq %rsi, %r8
-; CHECK-NEXT: shldq $4, %rdi, %r8
-; CHECK-NEXT: addq %rdi, %r8
-; CHECK-NEXT: movq %r8, %r9
-; CHECK-NEXT: andq %rax, %r9
-; CHECK-NEXT: movq %rsi, %rcx
-; CHECK-NEXT: shrq $56, %rcx
-; CHECK-NEXT: cmpq %rdx, %r9
-; CHECK-NEXT: adcq %r8, %rcx
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: movabsq $737869762948382065, %rdx # imm = 0xA3D70A3D70A3D71
-; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: mulq %rdx
-; CHECK-NEXT: leaq (%rdx,%rdx,4), %rax
-; CHECK-NEXT: leaq (%rax,%rax,4), %rax
-; CHECK-NEXT: subq %rax, %rcx
-; CHECK-NEXT: subq %rcx, %rdi
-; CHECK-NEXT: sbbq $0, %rsi
-; CHECK-NEXT: movabsq $2951479051793528258, %rcx # imm = 0x28F5C28F5C28F5C2
-; CHECK-NEXT: imulq %rdi, %rcx
-; CHECK-NEXT: movabsq $-8116567392432202711, %r8 # imm = 0x8F5C28F5C28F5C29
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: mulq %r8
-; CHECK-NEXT: addq %rcx, %rdx
-; CHECK-NEXT: imulq %rsi, %r8
-; CHECK-NEXT: addq %r8, %rdx
-; CHECK-NEXT: retq
-entry:
- %div = udiv i128 %x, 25
- ret i128 %div
-}
-
-define i128 @div_by_14(i128 %x) {
-; CHECK-LABEL: div_by_14:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movl $14, %edx
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: callq __udivti3 at PLT
-; CHECK-NEXT: popq %rcx
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
-entry:
- %div = udiv i128 %x, 14
- ret i128 %div
-}
-
-define i128 @div_by_22(i128 %x) {
-; CHECK-LABEL: div_by_22:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movl $22, %edx
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: callq __udivti3 at PLT
-; CHECK-NEXT: popq %rcx
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
-entry:
- %div = udiv i128 %x, 22
- ret i128 %div
-}
>From b0869557b22b459f4c6ac2056881f43bb84659e1 Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Mon, 2 Mar 2026 18:24:36 +0530
Subject: [PATCH 07/15] Improve variable naming
---
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 45d79fd8cecf0..880ee3dcc7894 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8258,8 +8258,8 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
// Then 2^W ≡ 1 (mod Divisor), so a value written in base 2^W can be
// reduced modulo Divisor by summing its W-bit chunks.
for (unsigned i = MaxChunk; i > MaxChunk / 2; --i) {
- APInt ChunkMaxPlus1 = APInt::getOneBitSet(BitWidth, i);
- if (ChunkMaxPlus1.urem(Divisor).isOne()) {
+ APInt Pow2 = APInt::getOneBitSet(BitWidth, i);
+ if (Pow2.urem(Divisor).isOne()) {
BestChunkWidth = i;
break;
}
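A quick worked check of that congruence (numbers chosen for illustration, not taken from the patch): with W = 3 and Divisor = 7 we have 2^3 = 8 ≡ 1 (mod 7). Writing 100 in base 2^3 as 1·2^6 + 4·2^3 + 4, the chunk sum is 1 + 4 + 4 = 9, and 9 mod 7 = 2, which matches 100 mod 7 = 2.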
>From de869c291d386ffcb1997a2729b0a12910c90901 Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Mon, 2 Mar 2026 21:16:45 +0530
Subject: [PATCH 08/15] Address review comments
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 39 +++++++++++++------
llvm/test/CodeGen/X86/i128-divrem-by-const.ll | 2 +-
2 files changed, 28 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 880ee3dcc7894..d02c61cce3ac3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8245,24 +8245,40 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
// for chunk operations.
unsigned MaxChunk;
EVT LegalVT = EVT(getRegisterType(*DAG.getContext(), VT));
- if (LegalVT.isInteger())
- MaxChunk = LegalVT.getSizeInBits();
- else
+ if (!LegalVT.isInteger())
return false;
// Clamp to the original bit width.
- MaxChunk = std::min(MaxChunk, BitWidth);
+ MaxChunk = std::min<unsigned>(LegalVT.getScalarSizeInBits(), BitWidth);
+
+ // Find the largest W in (MaxChunk/2, MaxChunk] such that
+ // 2^W ≡ 1 (mod Divisor). If this holds, the value can be
+ // reduced modulo Divisor by summing W-bit chunks.
+ //
+ // Instead of constructing 2^W for each candidate, compute
+ // 2^MaxChunk mod Divisor once and walk downward, maintaining:
+ //
+ // Mod == 2^i mod Divisor
+ //
+ // For each decrement of i, update Mod by multiplying with
+ // the modular inverse of 2 (Divisor is known to be odd here).
+ // Compute 2^MaxChunk mod Divisor
+ APInt Mod(Divisor.getBitWidth(), 1);
+ for (unsigned k = 0; k < MaxChunk; ++k)
+ Mod = (Mod.shl(1)).urem(Divisor);
+
+ // Since Divisor is odd, inverse of 2 mod D is (D+1)/2
+ APInt Inv2 = (Divisor + 1).lshr(1);
- // Find the largest chunk width W in (MaxChunk/2, MaxChunk] satisfying
- // (1 << W) % Divisor == 1.
- // Then 2^W ≡ 1 (mod Divisor), so a value written in base 2^W can be
- // reduced modulo Divisor by summing its W-bit chunks.
+ // Walk downward to find the largest valid W
for (unsigned i = MaxChunk; i > MaxChunk / 2; --i) {
- APInt Pow2 = APInt::getOneBitSet(BitWidth, i);
- if (Pow2.urem(Divisor).isOne()) {
+ if (Mod.isOne()) {
BestChunkWidth = i;
break;
}
+
+ // Move from 2^i to 2^(i-1)
+ Mod = (Mod * Inv2).urem(Divisor);
}
// If we found a good chunk width, slice the number and sum the pieces.
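The same search, written as a standalone C++ sketch with plain integers in place of APInt (the function name findChunkWidth and the 64-bit divisor type are illustrative assumptions; as in the code above, the divisor is assumed odd):

  #include <cstdint>

  // Find the largest W in (MaxChunk/2, MaxChunk] with 2^W == 1 (mod Divisor),
  // maintaining Mod == 2^I mod Divisor while walking I downward.
  unsigned findChunkWidth(uint64_t Divisor, unsigned MaxChunk) {
    uint64_t Mod = 1;
    for (unsigned K = 0; K < MaxChunk; ++K)
      Mod = (2 * (unsigned __int128)Mod) % Divisor; // now 2^(K+1) mod Divisor
    // For odd Divisor, the inverse of 2 is (Divisor + 1) / 2, because
    // 2 * ((Divisor + 1) / 2) = Divisor + 1 == 1 (mod Divisor).
    uint64_t Inv2 = (Divisor + 1) / 2;
    for (unsigned I = MaxChunk; I > MaxChunk / 2; --I) {
      if (Mod == 1)
        return I; // 2^I == 1 (mod Divisor)
      Mod = ((unsigned __int128)Mod * Inv2) % Divisor; // step to 2^(I-1)
    }
    return 0; // no usable chunk width in range
  }

For Divisor = 7 and MaxChunk = 64 this returns 63, matching the 63-bit mask (0x7FFFFFFFFFFFFFFF) seen in the x86 div_by_7 tests in this series.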
@@ -8282,8 +8298,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
Chunk = DAG.getNode(ISD::TRUNCATE, dl, ChunkVT, Chunk);
Parts.push_back(Chunk);
}
- if (Parts.empty())
- return false;
+ assert(!Parts.empty() && "Failed to split divisor into chunks");
Sum = Parts[0];
// Use uaddo_carry if we can, otherwise use a compare to detect overflow.
diff --git a/llvm/test/CodeGen/X86/i128-divrem-by-const.ll b/llvm/test/CodeGen/X86/i128-divrem-by-const.ll
index ccf80e4892711..941f33b0e9b8f 100644
--- a/llvm/test/CodeGen/X86/i128-divrem-by-const.ll
+++ b/llvm/test/CodeGen/X86/i128-divrem-by-const.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux | FileCheck %s
define i128 @div_by_7(i128 %x) {
; CHECK-LABEL: div_by_7:
>From aeabecdbf78fe09850049a990b44583547571059 Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Tue, 3 Mar 2026 14:25:10 +0530
Subject: [PATCH 09/15] Address review comments
---
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index d02c61cce3ac3..dcad1bb1ac9b7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8243,13 +8243,11 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
// Determine the largest legal scalar integer type we can safely use
// for chunk operations.
- unsigned MaxChunk;
- EVT LegalVT = EVT(getRegisterType(*DAG.getContext(), VT));
- if (!LegalVT.isInteger())
- return false;
+ EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
// Clamp to the original bit width.
- MaxChunk = std::min<unsigned>(LegalVT.getScalarSizeInBits(), BitWidth);
+ unsigned MaxChunk =
+ std::min<unsigned>(LegalVT.getScalarSizeInBits(), BitWidth);
// Find the largest W in (MaxChunk/2, MaxChunk] such that
// 2^W ≡ 1 (mod Divisor). If this holds, the value can be
>From 8dac2e924f81594c9a7860df67a3fd581dc2351b Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Thu, 12 Mar 2026 01:33:33 +0530
Subject: [PATCH 10/15] Separate vector idiv tests from scalar
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 1 -
llvm/test/CodeGen/X86/i128-divrem-by-const.ll | 391 --
llvm/test/CodeGen/X86/i128-udiv.ll | 3232 ++++++++++++++++-
llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll | 488 +++
4 files changed, 3713 insertions(+), 399 deletions(-)
delete mode 100644 llvm/test/CodeGen/X86/i128-divrem-by-const.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index dcad1bb1ac9b7..dda1cb1dcedd5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8231,7 +8231,6 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
DAG.getConstant(0, dl, HiLoVT));
Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, Sum, Carry);
}
-
} else {
// If we cannot split into two halves, look for a smaller chunk
// width where (1 << ChunkWidth) mod Divisor == 1.
diff --git a/llvm/test/CodeGen/X86/i128-divrem-by-const.ll b/llvm/test/CodeGen/X86/i128-divrem-by-const.ll
deleted file mode 100644
index 941f33b0e9b8f..0000000000000
--- a/llvm/test/CodeGen/X86/i128-divrem-by-const.ll
+++ /dev/null
@@ -1,391 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux | FileCheck %s
-
-define i128 @div_by_7(i128 %x) {
-; CHECK-LABEL: div_by_7:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT: movq %rdi, %rdx
-; CHECK-NEXT: andq %rax, %rdx
-; CHECK-NEXT: movq %rsi, %r8
-; CHECK-NEXT: shldq $1, %rdi, %r8
-; CHECK-NEXT: addq %rdi, %r8
-; CHECK-NEXT: movq %r8, %r9
-; CHECK-NEXT: andq %rax, %r9
-; CHECK-NEXT: movq %rsi, %rcx
-; CHECK-NEXT: shrq $62, %rcx
-; CHECK-NEXT: cmpq %rdx, %r9
-; CHECK-NEXT: adcq %r8, %rcx
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: movabsq $5270498306774157605, %rdx # imm = 0x4924924924924925
-; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: mulq %rdx
-; CHECK-NEXT: shrq %rdx
-; CHECK-NEXT: leaq (,%rdx,8), %rax
-; CHECK-NEXT: subq %rax, %rdx
-; CHECK-NEXT: addq %rcx, %rdx
-; CHECK-NEXT: subq %rdx, %rdi
-; CHECK-NEXT: sbbq $0, %rsi
-; CHECK-NEXT: movabsq $-5270498306774157605, %rcx # imm = 0xB6DB6DB6DB6DB6DB
-; CHECK-NEXT: imulq %rdi, %rcx
-; CHECK-NEXT: movabsq $7905747460161236407, %r8 # imm = 0x6DB6DB6DB6DB6DB7
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: mulq %r8
-; CHECK-NEXT: addq %rcx, %rdx
-; CHECK-NEXT: imulq %rsi, %r8
-; CHECK-NEXT: addq %r8, %rdx
-; CHECK-NEXT: retq
-entry:
- %div = udiv i128 %x, 7
- ret i128 %div
-}
-
-define i128 @div_by_9(i128 %x) {
-; CHECK-LABEL: div_by_9:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
-; CHECK-NEXT: movq %rdi, %rdx
-; CHECK-NEXT: andq %rax, %rdx
-; CHECK-NEXT: movq %rsi, %r8
-; CHECK-NEXT: shldq $4, %rdi, %r8
-; CHECK-NEXT: addq %rdi, %r8
-; CHECK-NEXT: movq %r8, %r9
-; CHECK-NEXT: andq %rax, %r9
-; CHECK-NEXT: movq %rsi, %rcx
-; CHECK-NEXT: shrq $56, %rcx
-; CHECK-NEXT: cmpq %rdx, %r9
-; CHECK-NEXT: adcq %r8, %rcx
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: movabsq $2049638230412172402, %rdx # imm = 0x1C71C71C71C71C72
-; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: mulq %rdx
-; CHECK-NEXT: leaq (%rdx,%rdx,8), %rax
-; CHECK-NEXT: subq %rax, %rcx
-; CHECK-NEXT: subq %rcx, %rdi
-; CHECK-NEXT: sbbq $0, %rsi
-; CHECK-NEXT: movabsq $4099276460824344803, %rcx # imm = 0x38E38E38E38E38E3
-; CHECK-NEXT: imulq %rdi, %rcx
-; CHECK-NEXT: movabsq $-8198552921648689607, %r8 # imm = 0x8E38E38E38E38E39
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: mulq %r8
-; CHECK-NEXT: addq %rcx, %rdx
-; CHECK-NEXT: imulq %rsi, %r8
-; CHECK-NEXT: addq %r8, %rdx
-; CHECK-NEXT: retq
-entry:
- %div = udiv i128 %x, 9
- ret i128 %div
-}
-
-define i128 @div_by_11(i128 %x) {
-; CHECK-LABEL: div_by_11:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
-; CHECK-NEXT: movq %rdi, %rdx
-; CHECK-NEXT: andq %rax, %rdx
-; CHECK-NEXT: movq %rsi, %r8
-; CHECK-NEXT: shldq $4, %rdi, %r8
-; CHECK-NEXT: addq %rdi, %r8
-; CHECK-NEXT: movq %r8, %r9
-; CHECK-NEXT: andq %rax, %r9
-; CHECK-NEXT: movq %rsi, %rcx
-; CHECK-NEXT: shrq $56, %rcx
-; CHECK-NEXT: cmpq %rdx, %r9
-; CHECK-NEXT: adcq %r8, %rcx
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: movabsq $1676976733973595602, %rdx # imm = 0x1745D1745D1745D2
-; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: mulq %rdx
-; CHECK-NEXT: leaq (%rdx,%rdx,4), %rax
-; CHECK-NEXT: leaq (%rdx,%rax,2), %rax
-; CHECK-NEXT: subq %rax, %rcx
-; CHECK-NEXT: subq %rcx, %rdi
-; CHECK-NEXT: sbbq $0, %rsi
-; CHECK-NEXT: movabsq $-6707906935894382406, %rcx # imm = 0xA2E8BA2E8BA2E8BA
-; CHECK-NEXT: imulq %rdi, %rcx
-; CHECK-NEXT: movabsq $3353953467947191203, %r8 # imm = 0x2E8BA2E8BA2E8BA3
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: mulq %r8
-; CHECK-NEXT: addq %rcx, %rdx
-; CHECK-NEXT: imulq %rsi, %r8
-; CHECK-NEXT: addq %r8, %rdx
-; CHECK-NEXT: retq
- %div = udiv i128 %x, 11
- ret i128 %div
-}
-
-define i128 @div_by_22(i128 %x) {
-; CHECK-LABEL: div_by_22:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movl $22, %edx
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: callq __udivti3 at PLT
-; CHECK-NEXT: popq %rcx
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
-entry:
- %div = udiv i128 %x, 22
- ret i128 %div
-}
-
-define i128 @div_by_25(i128 %x) {
-; CHECK-LABEL: div_by_25:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
-; CHECK-NEXT: movq %rdi, %rdx
-; CHECK-NEXT: andq %rax, %rdx
-; CHECK-NEXT: movq %rsi, %r8
-; CHECK-NEXT: shldq $4, %rdi, %r8
-; CHECK-NEXT: addq %rdi, %r8
-; CHECK-NEXT: movq %r8, %r9
-; CHECK-NEXT: andq %rax, %r9
-; CHECK-NEXT: movq %rsi, %rcx
-; CHECK-NEXT: shrq $56, %rcx
-; CHECK-NEXT: cmpq %rdx, %r9
-; CHECK-NEXT: adcq %r8, %rcx
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: movabsq $737869762948382065, %rdx # imm = 0xA3D70A3D70A3D71
-; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: mulq %rdx
-; CHECK-NEXT: leaq (%rdx,%rdx,4), %rax
-; CHECK-NEXT: leaq (%rax,%rax,4), %rax
-; CHECK-NEXT: subq %rax, %rcx
-; CHECK-NEXT: subq %rcx, %rdi
-; CHECK-NEXT: sbbq $0, %rsi
-; CHECK-NEXT: movabsq $2951479051793528258, %rcx # imm = 0x28F5C28F5C28F5C2
-; CHECK-NEXT: imulq %rdi, %rcx
-; CHECK-NEXT: movabsq $-8116567392432202711, %r8 # imm = 0x8F5C28F5C28F5C29
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: mulq %r8
-; CHECK-NEXT: addq %rcx, %rdx
-; CHECK-NEXT: imulq %rsi, %r8
-; CHECK-NEXT: addq %r8, %rdx
-; CHECK-NEXT: retq
-entry:
- %div = udiv i128 %x, 25
- ret i128 %div
-}
-
-define i128 @div_by_56(i128 %x) {
-; CHECK-LABEL: div_by_56:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movl $56, %edx
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: callq __udivti3 at PLT
-; CHECK-NEXT: popq %rcx
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
- %div = udiv i128 %x, 56 ; 8 * 7
- ret i128 %div
-}
-
-define i128 @rem_by_7(i128 %x) {
-; CHECK-LABEL: rem_by_7:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT: movq %rsi, %rcx
-; CHECK-NEXT: shldq $1, %rdi, %rcx
-; CHECK-NEXT: addq %rdi, %rcx
-; CHECK-NEXT: andq %rax, %rdi
-; CHECK-NEXT: movq %rcx, %rdx
-; CHECK-NEXT: andq %rax, %rdx
-; CHECK-NEXT: shrq $62, %rsi
-; CHECK-NEXT: cmpq %rdi, %rdx
-; CHECK-NEXT: adcq %rsi, %rcx
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: movabsq $5270498306774157605, %rdx # imm = 0x4924924924924925
-; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: mulq %rdx
-; CHECK-NEXT: shrq %rdx
-; CHECK-NEXT: leaq (,%rdx,8), %rax
-; CHECK-NEXT: subq %rax, %rdx
-; CHECK-NEXT: addq %rdx, %rcx
-; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: retq
- %r = urem i128 %x, 7
- ret i128 %r
-}
-
-define i128 @rem_by_14(i128 %x) {
-; CHECK-LABEL: rem_by_14:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movl $14, %edx
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: callq __umodti3 at PLT
-; CHECK-NEXT: popq %rcx
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
- %r = urem i128 %x, 14
- ret i128 %r
-}
-
-define <2 x i64> @v2i64_div_by_7(<2 x i64> %x) {
-; CHECK-LABEL: v2i64_div_by_7:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %xmm0, %rcx
-; CHECK-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
-; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: mulq %rsi
-; CHECK-NEXT: subq %rdx, %rcx
-; CHECK-NEXT: shrq %rcx
-; CHECK-NEXT: addq %rdx, %rcx
-; CHECK-NEXT: movq %rcx, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; CHECK-NEXT: movq %xmm0, %rcx
-; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: mulq %rsi
-; CHECK-NEXT: subq %rdx, %rcx
-; CHECK-NEXT: shrq %rcx
-; CHECK-NEXT: addq %rdx, %rcx
-; CHECK-NEXT: movq %rcx, %xmm0
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT: psrlq $2, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: retq
-entry:
- %div = udiv <2 x i64> %x, <i64 7, i64 7>
- ret <2 x i64> %div
-}
-
-define <2 x i64> @v2i64_div_by_14(<2 x i64> %x) {
-; CHECK-LABEL: v2i64_div_by_14:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %xmm0, %rax
-; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
-; CHECK-NEXT: mulq %rcx
-; CHECK-NEXT: movq %rdx, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; CHECK-NEXT: movq %xmm0, %rax
-; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: mulq %rcx
-; CHECK-NEXT: movq %rdx, %xmm0
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT: psrlq $1, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: retq
-entry:
- %div = udiv <2 x i64> %x, <i64 14, i64 14>
- ret <2 x i64> %div
-}
-
-define <4 x i32> @v4i32_div_by_7(<4 x i32> %x) {
-; CHECK-LABEL: v4i32_div_by_7:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-NEXT: psubd %xmm2, %xmm0
-; CHECK-NEXT: psrld $1, %xmm0
-; CHECK-NEXT: paddd %xmm2, %xmm0
-; CHECK-NEXT: psrld $2, %xmm0
-; CHECK-NEXT: retq
-entry:
- %div = udiv <4 x i32> %x, <i32 7, i32 7, i32 7, i32 7>
- ret <4 x i32> %div
-}
-
-define <2 x i128> @v2i128_div_by_7(<2 x i128> %x) {
-; CHECK-LABEL: v2i128_div_by_7:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pushq %r15
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: pushq %r14
-; CHECK-NEXT: .cfi_def_cfa_offset 24
-; CHECK-NEXT: pushq %r12
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 40
-; CHECK-NEXT: .cfi_offset %rbx, -40
-; CHECK-NEXT: .cfi_offset %r12, -32
-; CHECK-NEXT: .cfi_offset %r14, -24
-; CHECK-NEXT: .cfi_offset %r15, -16
-; CHECK-NEXT: movq %rcx, %r9
-; CHECK-NEXT: movq %rdx, %rcx
-; CHECK-NEXT: movabsq $9223372036854775807, %r11 # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: andq %r11, %rax
-; CHECK-NEXT: shldq $1, %rsi, %rdx
-; CHECK-NEXT: addq %rsi, %rdx
-; CHECK-NEXT: movq %rdx, %rbx
-; CHECK-NEXT: andq %r11, %rbx
-; CHECK-NEXT: movq %rcx, %r10
-; CHECK-NEXT: shrq $62, %r10
-; CHECK-NEXT: cmpq %rax, %rbx
-; CHECK-NEXT: adcq %rdx, %r10
-; CHECK-NEXT: andq %r11, %r10
-; CHECK-NEXT: movabsq $5270498306774157605, %r15 # imm = 0x4924924924924925
-; CHECK-NEXT: movq %r10, %rax
-; CHECK-NEXT: mulq %r15
-; CHECK-NEXT: shrq %rdx
-; CHECK-NEXT: leaq (,%rdx,8), %rax
-; CHECK-NEXT: subq %rax, %rdx
-; CHECK-NEXT: addq %r10, %rdx
-; CHECK-NEXT: subq %rdx, %rsi
-; CHECK-NEXT: sbbq $0, %rcx
-; CHECK-NEXT: movabsq $-5270498306774157605, %rbx # imm = 0xB6DB6DB6DB6DB6DB
-; CHECK-NEXT: movq %rsi, %r10
-; CHECK-NEXT: imulq %rbx, %r10
-; CHECK-NEXT: movabsq $7905747460161236407, %r14 # imm = 0x6DB6DB6DB6DB6DB7
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: mulq %r14
-; CHECK-NEXT: movq %rax, %rsi
-; CHECK-NEXT: addq %r10, %rdx
-; CHECK-NEXT: imulq %r14, %rcx
-; CHECK-NEXT: addq %rdx, %rcx
-; CHECK-NEXT: movq %r9, %rax
-; CHECK-NEXT: andq %r11, %rax
-; CHECK-NEXT: movq %r8, %rdx
-; CHECK-NEXT: shldq $1, %r9, %rdx
-; CHECK-NEXT: addq %r9, %rdx
-; CHECK-NEXT: movq %rdx, %r12
-; CHECK-NEXT: andq %r11, %r12
-; CHECK-NEXT: movq %r8, %r10
-; CHECK-NEXT: shrq $62, %r10
-; CHECK-NEXT: cmpq %rax, %r12
-; CHECK-NEXT: adcq %rdx, %r10
-; CHECK-NEXT: andq %r11, %r10
-; CHECK-NEXT: movq %r10, %rax
-; CHECK-NEXT: mulq %r15
-; CHECK-NEXT: shrq %rdx
-; CHECK-NEXT: leaq (,%rdx,8), %rax
-; CHECK-NEXT: subq %rax, %rdx
-; CHECK-NEXT: addq %r10, %rdx
-; CHECK-NEXT: subq %rdx, %r9
-; CHECK-NEXT: sbbq $0, %r8
-; CHECK-NEXT: imulq %r9, %rbx
-; CHECK-NEXT: movq %r9, %rax
-; CHECK-NEXT: mulq %r14
-; CHECK-NEXT: addq %rbx, %rdx
-; CHECK-NEXT: imulq %r14, %r8
-; CHECK-NEXT: addq %rdx, %r8
-; CHECK-NEXT: movq %rax, 16(%rdi)
-; CHECK-NEXT: movq %rsi, (%rdi)
-; CHECK-NEXT: movq %r8, 24(%rdi)
-; CHECK-NEXT: movq %rcx, 8(%rdi)
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: popq %r12
-; CHECK-NEXT: .cfi_def_cfa_offset 24
-; CHECK-NEXT: popq %r14
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: popq %r15
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
-entry:
- %div = udiv <2 x i128> %x, <i128 7, i128 7>
- ret <2 x i128> %div
-}
diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll
index 9011832421326..3d2654d2bcf46 100644
--- a/llvm/test/CodeGen/X86/i128-udiv.ll
+++ b/llvm/test/CodeGen/X86/i128-udiv.ll
@@ -2,9 +2,6 @@
; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=X64
-; Make sure none of these crash, and that the power-of-two transformations
-; trigger correctly.
-
define i128 @test1(i128 %x) nounwind {
; X86-LABEL: test1:
; X86: # %bb.0:
@@ -37,8 +34,287 @@ define i128 @test1(i128 %x) nounwind {
define i128 @test2(i128 %x) nounwind {
; X86-LABEL: test2:
-; X86 doesn't have __divti3, so the urem is expanded into a loop.
-; X86: udiv-do-while
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $144, %esp
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: jne .LBB1_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: bsrl %esi, %ebx
+; X86-NEXT: xorl $31, %ebx
+; X86-NEXT: orl $32, %ebx
+; X86-NEXT: jmp .LBB1_3
+; X86-NEXT: .LBB1_1:
+; X86-NEXT: bsrl %edi, %ebx
+; X86-NEXT: xorl $31, %ebx
+; X86-NEXT: .LBB1_3: # %_udiv-special-cases
+; X86-NEXT: movl 24(%ebp), %edx
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: jne .LBB1_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: bsrl %edx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: jmp .LBB1_6
+; X86-NEXT: .LBB1_4:
+; X86-NEXT: bsrl %ecx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: .LBB1_6: # %_udiv-special-cases
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: jne .LBB1_8
+; X86-NEXT: # %bb.7: # %_udiv-special-cases
+; X86-NEXT: orl $64, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: .LBB1_8: # %_udiv-special-cases
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: je .LBB1_9
+; X86-NEXT: # %bb.10: # %select.false.sink
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: movl $127, %ecx
+; X86-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: setb %cl
+; X86-NEXT: .LBB1_11: # %select.end
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: jne .LBB1_13
+; X86-NEXT: # %bb.12: # %select.end
+; X86-NEXT: movl 28(%ebp), %ebx
+; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: .LBB1_13: # %select.end
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB1_14
+; X86-NEXT: # %bb.20: # %select.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: xorl $127, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: je .LBB1_21
+; X86-NEXT: # %bb.18: # %udiv-bb1
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorb $127, %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 120(%esp,%eax), %edx
+; X86-NEXT: movl 124(%esp,%eax), %edi
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 112(%esp,%eax), %edi
+; X86-NEXT: movl 116(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $1, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jb .LBB1_19
+; X86-NEXT: # %bb.15: # %udiv-preheader
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %esi
+; X86-NEXT: movl 76(%esp,%esi), %eax
+; X86-NEXT: movl 72(%esp,%esi), %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: shrdl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 64(%esp,%esi), %edx
+; X86-NEXT: movl 68(%esp,%esi), %esi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: shrdl %cl, %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $-4, %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB1_16: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esp), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: cmpl %ecx, %eax
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: andl %eax, %esi
+; X86-NEXT: movl $-4, %eax
+; X86-NEXT: andl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: subl %ecx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: addl $-1, %esi
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: jne .LBB1_16
+; X86-NEXT: .LBB1_17: # %udiv-loop-exit
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %edx
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: leal (%esi,%eax,2), %esi
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: .LBB1_21: # %udiv-end
+; X86-NEXT: movl %esi, (%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB1_9:
+; X86-NEXT: movb $1, %cl
+; X86-NEXT: jmp .LBB1_11
+; X86-NEXT: .LBB1_19:
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB1_17
+; X86-NEXT: .LBB1_14:
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: jmp .LBB1_21
;
; X64-LABEL: test2:
; X64: # %bb.0:
@@ -48,14 +324,308 @@ define i128 @test2(i128 %x) nounwind {
; X64-NEXT: callq __udivti3@PLT
; X64-NEXT: popq %rcx
; X64-NEXT: retq
+; X86 doesn't have __udivti3, so the udiv is expanded into a loop.
%tmp = udiv i128 %x, -73786976294838206464
ret i128 %tmp
}
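+; Note: as a u128 this divisor is 2^128 - 2^66, so no chunk width i <= 32 can
+; satisfy (1 << i) % D == 1 (2^i < D for every such i); the chunk-summation
+; path is not expected to fire here, and the expansion still uses the
+; bit-by-bit loop on X86 and __udivti3 on X64.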
define i128 @test3(i128 %x) nounwind {
; X86-LABEL: test3:
-; X86 doesn't have __divti3, so the urem is expanded into a loop.
-; X86: udiv-do-while
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $160, %esp
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl 36(%ebp), %edx
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB2_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: bsrl %edi, %ebx
+; X86-NEXT: xorl $31, %ebx
+; X86-NEXT: orl $32, %ebx
+; X86-NEXT: jmp .LBB2_3
+; X86-NEXT: .LBB2_1:
+; X86-NEXT: bsrl %edx, %ebx
+; X86-NEXT: xorl $31, %ebx
+; X86-NEXT: .LBB2_3: # %_udiv-special-cases
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: jne .LBB2_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
+; X86-NEXT: jmp .LBB2_6
+; X86-NEXT: .LBB2_4:
+; X86-NEXT: bsrl %esi, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: .LBB2_6: # %_udiv-special-cases
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: jne .LBB2_8
+; X86-NEXT: # %bb.7: # %_udiv-special-cases
+; X86-NEXT: orl $64, %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: .LBB2_8: # %_udiv-special-cases
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: je .LBB2_9
+; X86-NEXT: # %bb.10: # %select.false.sink
+; X86-NEXT: movl $127, %eax
+; X86-NEXT: cmpl %edi, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: sbbl %ecx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: setb %cl
+; X86-NEXT: .LBB2_11: # %select.end
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: jne .LBB2_13
+; X86-NEXT: # %bb.12: # %select.end
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: .LBB2_13: # %select.end
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB2_14
+; X86-NEXT: # %bb.20: # %select.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl $127, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: je .LBB2_21
+; X86-NEXT: # %bb.18: # %udiv-bb1
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %ebx
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorb $127, %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 136(%esp,%eax), %edx
+; X86-NEXT: movl 140(%esp,%eax), %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 128(%esp,%eax), %esi
+; X86-NEXT: movl 132(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $1, %ebx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jb .LBB2_19
+; X86-NEXT: # %bb.15: # %udiv-preheader
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: movl 92(%esp,%edx), %edi
+; X86-NEXT: movl 88(%esp,%edx), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: shrdl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%edx), %eax
+; X86-NEXT: movl 84(%esp,%edx), %ebx
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: shrdl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $-3, %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $-5, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB2_16: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %esi, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %edx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl $-1, %esi
+; X86-NEXT: andl %esi, %edi
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl $-5, %edx
+; X86-NEXT: andl %edx, %esi
+; X86-NEXT: movl $-3, %edx
+; X86-NEXT: andl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: subl %ecx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: addl $-1, %esi
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB2_16
+; X86-NEXT: .LBB2_17: # %udiv-loop-exit
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %ebx
+; X86-NEXT: shldl $1, %esi, %ecx
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: leal (%eax,%edx,2), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: .LBB2_21: # %udiv-end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB2_9:
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movb $1, %cl
+; X86-NEXT: jmp .LBB2_11
+; X86-NEXT: .LBB2_19:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB2_17
+; X86-NEXT: .LBB2_14:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: jmp .LBB2_21
;
; X64-LABEL: test3:
; X64: # %bb.0:
@@ -65,6 +635,2654 @@ define i128 @test3(i128 %x) nounwind {
; X64-NEXT: callq __udivti3@PLT
; X64-NEXT: popq %rcx
; X64-NEXT: retq
+; X86 doesn't have __udivti3, so the udiv is expanded into a loop.
%tmp = udiv i128 %x, -73786976294838206467
ret i128 %tmp
}
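+; Note: as a u128 this divisor is 2^128 - 2^66 - 3; as in test2, no chunk
+; width i <= 32 gives (1 << i) % D == 1, so the loop/libcall expansion remains.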
+
+define i128 @div_by_7(i128 %x) {
+; X86-LABEL: div_by_7:
+; X86: # %bb.0: # %entry_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $160, %esp
+; X86-NEXT: .cfi_offset %esi, -20
+; X86-NEXT: .cfi_offset %edi, -16
+; X86-NEXT: .cfi_offset %ebx, -12
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: jne .LBB3_1
+; X86-NEXT: # %bb.2: # %entry_udiv-special-cases
+; X86-NEXT: bsrl %edi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: jmp .LBB3_3
+; X86-NEXT: .LBB3_1:
+; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: .LBB3_3: # %entry_udiv-special-cases
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB3_4
+; X86-NEXT: # %bb.5: # %entry_udiv-special-cases
+; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
+; X86-NEXT: jmp .LBB3_6
+; X86-NEXT: .LBB3_4:
+; X86-NEXT: bsrl %edx, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: .LBB3_6: # %entry_udiv-special-cases
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: jne .LBB3_8
+; X86-NEXT: # %bb.7: # %entry_udiv-special-cases
+; X86-NEXT: orl $64, %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: .LBB3_8: # %entry_udiv-special-cases
+; X86-NEXT: movl $125, %ebx
+; X86-NEXT: subl %esi, %ebx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: je .LBB3_9
+; X86-NEXT: # %bb.10: # %select.false.sink
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: movl $127, %ecx
+; X86-NEXT: cmpl %ebx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: setb %cl
+; X86-NEXT: .LBB3_11: # %select.end
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jne .LBB3_13
+; X86-NEXT: # %bb.12: # %select.end
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB3_13: # %select.end
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jne .LBB3_14
+; X86-NEXT: # %bb.20: # %select.end
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorl $127, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: je .LBB3_21
+; X86-NEXT: # %bb.18: # %udiv-bb1
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorb $127, %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 136(%esp,%eax), %edx
+; X86-NEXT: movl 140(%esp,%eax), %edi
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 128(%esp,%eax), %edi
+; X86-NEXT: movl 132(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $1, %ebx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jb .LBB3_19
+; X86-NEXT: # %bb.15: # %udiv-preheader
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %edi
+; X86-NEXT: movl 92(%esp,%edi), %eax
+; X86-NEXT: movl 88(%esp,%edi), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shrdl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%edi), %esi
+; X86-NEXT: movl 84(%esp,%edi), %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: shrdl %cl, %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $7, %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB3_16: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %edi
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $7, %eax
+; X86-NEXT: andl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: addl $-1, %ebx
+; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB3_16
+; X86-NEXT: .LBB3_17: # %udiv-loop-exit
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: leal (%esi,%eax,2), %edi
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: .LBB3_21: # %udiv-end
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB3_9:
+; X86-NEXT: .cfi_def_cfa %ebp, 8
+; X86-NEXT: movb $1, %cl
+; X86-NEXT: jmp .LBB3_11
+; X86-NEXT: .LBB3_19:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB3_17
+; X86-NEXT: .LBB3_14:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jmp .LBB3_21
+;
+; X64-LABEL: div_by_7:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT: movq %rdi, %rdx
+; X64-NEXT: andq %rax, %rdx
+; X64-NEXT: movq %rdi, %r8
+; X64-NEXT: shrdq $63, %rsi, %r8
+; X64-NEXT: addq %rdi, %r8
+; X64-NEXT: movq %r8, %r9
+; X64-NEXT: andq %rax, %r9
+; X64-NEXT: movq %rsi, %rcx
+; X64-NEXT: shrq $62, %rcx
+; X64-NEXT: cmpq %rdx, %r9
+; X64-NEXT: adcq %r8, %rcx
+; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: movabsq $5270498306774157605, %rdx # imm = 0x4924924924924925
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %rdx
+; X64-NEXT: shrq %rdx
+; X64-NEXT: leaq (,%rdx,8), %rax
+; X64-NEXT: subq %rax, %rdx
+; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: subq %rdx, %rdi
+; X64-NEXT: sbbq $0, %rsi
+; X64-NEXT: movabsq $-5270498306774157605, %rcx # imm = 0xB6DB6DB6DB6DB6DB
+; X64-NEXT: imulq %rdi, %rcx
+; X64-NEXT: movabsq $7905747460161236407, %r8 # imm = 0x6DB6DB6DB6DB6DB7
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: imulq %rsi, %r8
+; X64-NEXT: addq %r8, %rdx
+; X64-NEXT: retq
+entry:
+ %div = udiv i128 %x, 7
+ ret i128 %div
+}
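+; Note on div_by_7: 2^3 = 8 == 1 (mod 7), hence 2^63 = (2^3)^21 == 1 (mod 7).
+; Splitting x into 63-bit chunks c0, c1, c2 gives
+;   x = c2*2^126 + c1*2^63 + c0 == c2 + c1 + c0 (mod 7),
+; which is what lets the X64 lowering above fold the i128 operand into a
+; single i64 value and divide it by 7 with one magic-number multiply instead
+; of calling __udivti3.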
+
+define i128 @div_by_9(i128 %x) {
+; X86-LABEL: div_by_9:
+; X86: # %bb.0: # %entry_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $160, %esp
+; X86-NEXT: .cfi_offset %esi, -20
+; X86-NEXT: .cfi_offset %edi, -16
+; X86-NEXT: .cfi_offset %ebx, -12
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: jne .LBB4_1
+; X86-NEXT: # %bb.2: # %entry_udiv-special-cases
+; X86-NEXT: bsrl %edi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: jmp .LBB4_3
+; X86-NEXT: .LBB4_1:
+; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: .LBB4_3: # %entry_udiv-special-cases
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB4_4
+; X86-NEXT: # %bb.5: # %entry_udiv-special-cases
+; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
+; X86-NEXT: jmp .LBB4_6
+; X86-NEXT: .LBB4_4:
+; X86-NEXT: bsrl %edx, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: .LBB4_6: # %entry_udiv-special-cases
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: jne .LBB4_8
+; X86-NEXT: # %bb.7: # %entry_udiv-special-cases
+; X86-NEXT: orl $64, %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: .LBB4_8: # %entry_udiv-special-cases
+; X86-NEXT: movl $124, %ebx
+; X86-NEXT: subl %esi, %ebx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: je .LBB4_9
+; X86-NEXT: # %bb.10: # %select.false.sink
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: movl $127, %ecx
+; X86-NEXT: cmpl %ebx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: setb %cl
+; X86-NEXT: .LBB4_11: # %select.end
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jne .LBB4_13
+; X86-NEXT: # %bb.12: # %select.end
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB4_13: # %select.end
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jne .LBB4_14
+; X86-NEXT: # %bb.20: # %select.end
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorl $127, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: je .LBB4_21
+; X86-NEXT: # %bb.18: # %udiv-bb1
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorb $127, %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 136(%esp,%eax), %edx
+; X86-NEXT: movl 140(%esp,%eax), %edi
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 128(%esp,%eax), %edi
+; X86-NEXT: movl 132(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $1, %ebx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jb .LBB4_19
+; X86-NEXT: # %bb.15: # %udiv-preheader
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %edi
+; X86-NEXT: movl 92(%esp,%edi), %eax
+; X86-NEXT: movl 88(%esp,%edi), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shrdl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%edi), %esi
+; X86-NEXT: movl 84(%esp,%edi), %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: shrdl %cl, %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $9, %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB4_16: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %edi
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $9, %eax
+; X86-NEXT: andl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: addl $-1, %ebx
+; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB4_16
+; X86-NEXT: .LBB4_17: # %udiv-loop-exit
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: leal (%esi,%eax,2), %edi
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: .LBB4_21: # %udiv-end
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB4_9:
+; X86-NEXT: .cfi_def_cfa %ebp, 8
+; X86-NEXT: movb $1, %cl
+; X86-NEXT: jmp .LBB4_11
+; X86-NEXT: .LBB4_19:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB4_17
+; X86-NEXT: .LBB4_14:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jmp .LBB4_21
+;
+; X64-LABEL: div_by_9:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
+; X64-NEXT: movq %rdi, %rdx
+; X64-NEXT: andq %rax, %rdx
+; X64-NEXT: movq %rdi, %r8
+; X64-NEXT: shrdq $60, %rsi, %r8
+; X64-NEXT: addq %rdi, %r8
+; X64-NEXT: movq %r8, %r9
+; X64-NEXT: andq %rax, %r9
+; X64-NEXT: movq %rsi, %rcx
+; X64-NEXT: shrq $56, %rcx
+; X64-NEXT: cmpq %rdx, %r9
+; X64-NEXT: adcq %r8, %rcx
+; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: movabsq $2049638230412172402, %rdx # imm = 0x1C71C71C71C71C72
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %rdx
+; X64-NEXT: leaq (%rdx,%rdx,8), %rax
+; X64-NEXT: subq %rax, %rcx
+; X64-NEXT: subq %rcx, %rdi
+; X64-NEXT: sbbq $0, %rsi
+; X64-NEXT: movabsq $4099276460824344803, %rcx # imm = 0x38E38E38E38E38E3
+; X64-NEXT: imulq %rdi, %rcx
+; X64-NEXT: movabsq $-8198552921648689607, %r8 # imm = 0x8E38E38E38E38E39
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: imulq %rsi, %r8
+; X64-NEXT: addq %r8, %rdx
+; X64-NEXT: retq
+entry:
+ %div = udiv i128 %x, 9
+ ret i128 %div
+}
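+; Note on div_by_9: 2^6 = 64 == 1 (mod 9), hence 2^60 = (2^6)^10 == 1 (mod 9).
+; The X64 lowering above therefore sums the 60-bit chunks of x (split at bits
+; 60 and 120, mask 0xFFFFFFFFFFFFFFF) before a single 64-bit magic-number
+; divide by 9.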
+
+define i128 @div_by_11(i128 %x) {
+; X86-LABEL: div_by_11:
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $160, %esp
+; X86-NEXT: .cfi_offset %esi, -20
+; X86-NEXT: .cfi_offset %edi, -16
+; X86-NEXT: .cfi_offset %ebx, -12
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: jne .LBB5_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: bsrl %edi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: jmp .LBB5_3
+; X86-NEXT: .LBB5_1:
+; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: .LBB5_3: # %_udiv-special-cases
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB5_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
+; X86-NEXT: jmp .LBB5_6
+; X86-NEXT: .LBB5_4:
+; X86-NEXT: bsrl %edx, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: .LBB5_6: # %_udiv-special-cases
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: jne .LBB5_8
+; X86-NEXT: # %bb.7: # %_udiv-special-cases
+; X86-NEXT: orl $64, %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: .LBB5_8: # %_udiv-special-cases
+; X86-NEXT: movl $124, %ebx
+; X86-NEXT: subl %esi, %ebx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: je .LBB5_9
+; X86-NEXT: # %bb.10: # %select.false.sink
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: movl $127, %ecx
+; X86-NEXT: cmpl %ebx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: setb %cl
+; X86-NEXT: .LBB5_11: # %select.end
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jne .LBB5_13
+; X86-NEXT: # %bb.12: # %select.end
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB5_13: # %select.end
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jne .LBB5_14
+; X86-NEXT: # %bb.20: # %select.end
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorl $127, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: je .LBB5_21
+; X86-NEXT: # %bb.18: # %udiv-bb1
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorb $127, %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 136(%esp,%eax), %edx
+; X86-NEXT: movl 140(%esp,%eax), %edi
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 128(%esp,%eax), %edi
+; X86-NEXT: movl 132(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $1, %ebx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jb .LBB5_19
+; X86-NEXT: # %bb.15: # %udiv-preheader
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %edi
+; X86-NEXT: movl 92(%esp,%edi), %eax
+; X86-NEXT: movl 88(%esp,%edi), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shrdl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%edi), %esi
+; X86-NEXT: movl 84(%esp,%edi), %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: shrdl %cl, %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $11, %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB5_16: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %edi
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $11, %eax
+; X86-NEXT: andl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: addl $-1, %ebx
+; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB5_16
+; X86-NEXT: .LBB5_17: # %udiv-loop-exit
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: leal (%esi,%eax,2), %edi
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: .LBB5_21: # %udiv-end
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB5_9:
+; X86-NEXT: .cfi_def_cfa %ebp, 8
+; X86-NEXT: movb $1, %cl
+; X86-NEXT: jmp .LBB5_11
+; X86-NEXT: .LBB5_19:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB5_17
+; X86-NEXT: .LBB5_14:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jmp .LBB5_21
+;
+; X64-LABEL: div_by_11:
+; X64: # %bb.0:
+; X64-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
+; X64-NEXT: movq %rdi, %rdx
+; X64-NEXT: andq %rax, %rdx
+; X64-NEXT: movq %rdi, %r8
+; X64-NEXT: shrdq $60, %rsi, %r8
+; X64-NEXT: addq %rdi, %r8
+; X64-NEXT: movq %r8, %r9
+; X64-NEXT: andq %rax, %r9
+; X64-NEXT: movq %rsi, %rcx
+; X64-NEXT: shrq $56, %rcx
+; X64-NEXT: cmpq %rdx, %r9
+; X64-NEXT: adcq %r8, %rcx
+; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: movabsq $1676976733973595602, %rdx # imm = 0x1745D1745D1745D2
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %rdx
+; X64-NEXT: leaq (%rdx,%rdx,4), %rax
+; X64-NEXT: leaq (%rdx,%rax,2), %rax
+; X64-NEXT: subq %rax, %rcx
+; X64-NEXT: subq %rcx, %rdi
+; X64-NEXT: sbbq $0, %rsi
+; X64-NEXT: movabsq $-6707906935894382406, %rcx # imm = 0xA2E8BA2E8BA2E8BA
+; X64-NEXT: imulq %rdi, %rcx
+; X64-NEXT: movabsq $3353953467947191203, %r8 # imm = 0x2E8BA2E8BA2E8BA3
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: imulq %rsi, %r8
+; X64-NEXT: addq %r8, %rdx
+; X64-NEXT: retq
+ %div = udiv i128 %x, 11
+ ret i128 %div
+}
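+; Note on div_by_11: 2^10 = 1024 == 1 (mod 11), hence 2^60 = (2^10)^6 == 1
+; (mod 11), so the same 60-bit chunk summation applies before the 64-bit
+; magic-number divide by 11.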
+
+define i128 @div_by_22(i128 %x) {
+; X86-LABEL: div_by_22:
+; X86: # %bb.0: # %entry_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $160, %esp
+; X86-NEXT: .cfi_offset %esi, -20
+; X86-NEXT: .cfi_offset %edi, -16
+; X86-NEXT: .cfi_offset %ebx, -12
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: jne .LBB6_1
+; X86-NEXT: # %bb.2: # %entry_udiv-special-cases
+; X86-NEXT: bsrl %edi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: jmp .LBB6_3
+; X86-NEXT: .LBB6_1:
+; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: .LBB6_3: # %entry_udiv-special-cases
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB6_4
+; X86-NEXT: # %bb.5: # %entry_udiv-special-cases
+; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
+; X86-NEXT: jmp .LBB6_6
+; X86-NEXT: .LBB6_4:
+; X86-NEXT: bsrl %edx, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: .LBB6_6: # %entry_udiv-special-cases
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: jne .LBB6_8
+; X86-NEXT: # %bb.7: # %entry_udiv-special-cases
+; X86-NEXT: orl $64, %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: .LBB6_8: # %entry_udiv-special-cases
+; X86-NEXT: movl $123, %ebx
+; X86-NEXT: subl %esi, %ebx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: je .LBB6_9
+; X86-NEXT: # %bb.10: # %select.false.sink
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: movl $127, %ecx
+; X86-NEXT: cmpl %ebx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: setb %cl
+; X86-NEXT: .LBB6_11: # %select.end
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jne .LBB6_13
+; X86-NEXT: # %bb.12: # %select.end
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB6_13: # %select.end
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jne .LBB6_14
+; X86-NEXT: # %bb.20: # %select.end
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorl $127, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: je .LBB6_21
+; X86-NEXT: # %bb.18: # %udiv-bb1
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorb $127, %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 136(%esp,%eax), %edx
+; X86-NEXT: movl 140(%esp,%eax), %edi
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 128(%esp,%eax), %edi
+; X86-NEXT: movl 132(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $1, %ebx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jb .LBB6_19
+; X86-NEXT: # %bb.15: # %udiv-preheader
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %edi
+; X86-NEXT: movl 92(%esp,%edi), %eax
+; X86-NEXT: movl 88(%esp,%edi), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shrdl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%edi), %esi
+; X86-NEXT: movl 84(%esp,%edi), %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: shrdl %cl, %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $22, %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB6_16: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %edi
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $22, %eax
+; X86-NEXT: andl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: addl $-1, %ebx
+; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB6_16
+; X86-NEXT: .LBB6_17: # %udiv-loop-exit
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: leal (%esi,%eax,2), %edi
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: .LBB6_21: # %udiv-end
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB6_9:
+; X86-NEXT: .cfi_def_cfa %ebp, 8
+; X86-NEXT: movb $1, %cl
+; X86-NEXT: jmp .LBB6_11
+; X86-NEXT: .LBB6_19:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB6_17
+; X86-NEXT: .LBB6_14:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jmp .LBB6_21
+;
+; X64-LABEL: div_by_22:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: movl $22, %edx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: callq __udivti3 at PLT
+; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
+; X64-NEXT: retq
+entry:
+ %div = udiv i128 %x, 22
+ ret i128 %div
+}
+
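The contrast with div_by_11 is deliberate: 22 is even, so (1 << k) % 22 is even for every k >= 1 and can never be 1. No chunk width qualifies, which is why the X64 path above still falls back to the __udivti3 libcall. A quick exhaustive check (illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // An even divisor can never satisfy (1 << K) % D == 1 for K >= 1,
  // because the remainder of an even value by an even modulus is even.
  for (unsigned K = 1; K < 64; ++K)
    assert((((uint64_t)1 << K) % 22) != 1);
  return 0;
}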
+define i128 @div_by_25(i128 %x) {
+; X86-LABEL: div_by_25:
+; X86: # %bb.0: # %entry_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $160, %esp
+; X86-NEXT: .cfi_offset %esi, -20
+; X86-NEXT: .cfi_offset %edi, -16
+; X86-NEXT: .cfi_offset %ebx, -12
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: jne .LBB7_1
+; X86-NEXT: # %bb.2: # %entry_udiv-special-cases
+; X86-NEXT: bsrl %edi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: jmp .LBB7_3
+; X86-NEXT: .LBB7_1:
+; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: .LBB7_3: # %entry_udiv-special-cases
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB7_4
+; X86-NEXT: # %bb.5: # %entry_udiv-special-cases
+; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
+; X86-NEXT: jmp .LBB7_6
+; X86-NEXT: .LBB7_4:
+; X86-NEXT: bsrl %edx, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: .LBB7_6: # %entry_udiv-special-cases
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: jne .LBB7_8
+; X86-NEXT: # %bb.7: # %entry_udiv-special-cases
+; X86-NEXT: orl $64, %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: .LBB7_8: # %entry_udiv-special-cases
+; X86-NEXT: movl $123, %ebx
+; X86-NEXT: subl %esi, %ebx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: je .LBB7_9
+; X86-NEXT: # %bb.10: # %select.false.sink
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: movl $127, %ecx
+; X86-NEXT: cmpl %ebx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: setb %cl
+; X86-NEXT: .LBB7_11: # %select.end
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jne .LBB7_13
+; X86-NEXT: # %bb.12: # %select.end
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB7_13: # %select.end
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jne .LBB7_14
+; X86-NEXT: # %bb.20: # %select.end
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorl $127, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: je .LBB7_21
+; X86-NEXT: # %bb.18: # %udiv-bb1
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorb $127, %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 136(%esp,%eax), %edx
+; X86-NEXT: movl 140(%esp,%eax), %edi
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 128(%esp,%eax), %edi
+; X86-NEXT: movl 132(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $1, %ebx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jb .LBB7_19
+; X86-NEXT: # %bb.15: # %udiv-preheader
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %edi
+; X86-NEXT: movl 92(%esp,%edi), %eax
+; X86-NEXT: movl 88(%esp,%edi), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shrdl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%edi), %esi
+; X86-NEXT: movl 84(%esp,%edi), %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: shrdl %cl, %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $25, %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB7_16: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %edi
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $25, %eax
+; X86-NEXT: andl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: addl $-1, %ebx
+; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB7_16
+; X86-NEXT: .LBB7_17: # %udiv-loop-exit
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: leal (%esi,%eax,2), %edi
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: .LBB7_21: # %udiv-end
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB7_9:
+; X86-NEXT: .cfi_def_cfa %ebp, 8
+; X86-NEXT: movb $1, %cl
+; X86-NEXT: jmp .LBB7_11
+; X86-NEXT: .LBB7_19:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB7_17
+; X86-NEXT: .LBB7_14:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jmp .LBB7_21
+;
+; X64-LABEL: div_by_25:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
+; X64-NEXT: movq %rdi, %rdx
+; X64-NEXT: andq %rax, %rdx
+; X64-NEXT: movq %rdi, %r8
+; X64-NEXT: shrdq $60, %rsi, %r8
+; X64-NEXT: addq %rdi, %r8
+; X64-NEXT: movq %r8, %r9
+; X64-NEXT: andq %rax, %r9
+; X64-NEXT: movq %rsi, %rcx
+; X64-NEXT: shrq $56, %rcx
+; X64-NEXT: cmpq %rdx, %r9
+; X64-NEXT: adcq %r8, %rcx
+; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: movabsq $737869762948382065, %rdx # imm = 0xA3D70A3D70A3D71
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %rdx
+; X64-NEXT: leaq (%rdx,%rdx,4), %rax
+; X64-NEXT: leaq (%rax,%rax,4), %rax
+; X64-NEXT: subq %rax, %rcx
+; X64-NEXT: subq %rcx, %rdi
+; X64-NEXT: sbbq $0, %rsi
+; X64-NEXT: movabsq $2951479051793528258, %rcx # imm = 0x28F5C28F5C28F5C2
+; X64-NEXT: imulq %rdi, %rcx
+; X64-NEXT: movabsq $-8116567392432202711, %r8 # imm = 0x8F5C28F5C28F5C29
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: imulq %rsi, %r8
+; X64-NEXT: addq %r8, %rdx
+; X64-NEXT: retq
+entry:
+ %div = udiv i128 %x, 25
+ ret i128 %div
+}
+
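25 qualifies for the same reason as 11: 2^20 = 1048576 ≡ 1 (mod 25), hence 2^60 = (2^20)^3 ≡ 1 (mod 25), and the X64 output again uses 60-bit chunks. A one-line sanity check (illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // 2^20 == 41943 * 25 + 1, so 2^60 == (2^20)^3 == 1 (mod 25) as well.
  assert((((uint64_t)1 << 20) % 25) == 1);
  assert((((uint64_t)1 << 60) % 25) == 1);
  return 0;
}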
+define i128 @div_by_56(i128 %x) {
+; X86-LABEL: div_by_56:
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $160, %esp
+; X86-NEXT: .cfi_offset %esi, -20
+; X86-NEXT: .cfi_offset %edi, -16
+; X86-NEXT: .cfi_offset %ebx, -12
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: jne .LBB8_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: bsrl %edi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: jmp .LBB8_3
+; X86-NEXT: .LBB8_1:
+; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: .LBB8_3: # %_udiv-special-cases
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB8_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
+; X86-NEXT: jmp .LBB8_6
+; X86-NEXT: .LBB8_4:
+; X86-NEXT: bsrl %edx, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: .LBB8_6: # %_udiv-special-cases
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: jne .LBB8_8
+; X86-NEXT: # %bb.7: # %_udiv-special-cases
+; X86-NEXT: orl $64, %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: .LBB8_8: # %_udiv-special-cases
+; X86-NEXT: movl $122, %ebx
+; X86-NEXT: subl %esi, %ebx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: je .LBB8_9
+; X86-NEXT: # %bb.10: # %select.false.sink
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: movl $127, %ecx
+; X86-NEXT: cmpl %ebx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: setb %cl
+; X86-NEXT: .LBB8_11: # %select.end
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jne .LBB8_13
+; X86-NEXT: # %bb.12: # %select.end
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB8_13: # %select.end
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jne .LBB8_14
+; X86-NEXT: # %bb.20: # %select.end
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorl $127, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: je .LBB8_21
+; X86-NEXT: # %bb.18: # %udiv-bb1
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorb $127, %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 136(%esp,%eax), %edx
+; X86-NEXT: movl 140(%esp,%eax), %edi
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 128(%esp,%eax), %edi
+; X86-NEXT: movl 132(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $1, %ebx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jb .LBB8_19
+; X86-NEXT: # %bb.15: # %udiv-preheader
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %edi
+; X86-NEXT: movl 92(%esp,%edi), %eax
+; X86-NEXT: movl 88(%esp,%edi), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shrdl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%edi), %esi
+; X86-NEXT: movl 84(%esp,%edi), %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: shrdl %cl, %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $56, %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB8_16: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %edi
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $56, %eax
+; X86-NEXT: andl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: addl $-1, %ebx
+; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB8_16
+; X86-NEXT: .LBB8_17: # %udiv-loop-exit
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: leal (%esi,%eax,2), %edi
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: .LBB8_21: # %udiv-end
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB8_9:
+; X86-NEXT: .cfi_def_cfa %ebp, 8
+; X86-NEXT: movb $1, %cl
+; X86-NEXT: jmp .LBB8_11
+; X86-NEXT: .LBB8_19:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB8_17
+; X86-NEXT: .LBB8_14:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jmp .LBB8_21
+;
+; X64-LABEL: div_by_56:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: movl $56, %edx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: callq __udivti3 at PLT
+; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
+; X64-NEXT: retq
+ %div = udiv i128 %x, 56 ; 8 * 7
+ ret i128 %div
+}
+
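56 = 8 * 7 is even, so as with 22 no chunk width satisfies (1 << k) % 56 == 1 and the expansion falls back to __udivti3. Note that mathematically x / 56 == (x >> 3) / 7, and 7 does satisfy the condition; factoring out the power of two first would be a possible follow-up, not something this patch attempts. Illustrative check only, not part of the patch (the 128-bit division here itself lowers to __udivti3):

#include <cassert>
#include <cstdint>

int main() {
  // 56 is even, so (1 << K) % 56 == 1 has no solution for K >= 1.
  for (unsigned K = 1; K < 64; ++K)
    assert((((uint64_t)1 << K) % 56) != 1);
  // But x / 56 == (x >> 3) / 7, and 7 satisfies (1 << 3) % 7 == 1.
  unsigned __int128 X =
      ((unsigned __int128)0xfedcba9876543210ULL << 64) | 0x0123456789abcdefULL;
  assert(X / 56 == (X >> 3) / 7);
  return 0;
}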
+define i128 @rem_by_7(i128 %x) {
+; X86-LABEL: rem_by_7:
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $160, %esp
+; X86-NEXT: .cfi_offset %esi, -20
+; X86-NEXT: .cfi_offset %edi, -16
+; X86-NEXT: .cfi_offset %ebx, -12
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: jne .LBB9_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: orl $32, %edx
+; X86-NEXT: jmp .LBB9_3
+; X86-NEXT: .LBB9_1:
+; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: .LBB9_3: # %_udiv-special-cases
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: jne .LBB9_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: bsrl %esi, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
+; X86-NEXT: jmp .LBB9_6
+; X86-NEXT: .LBB9_4:
+; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: .LBB9_6: # %_udiv-special-cases
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: jne .LBB9_8
+; X86-NEXT: # %bb.7: # %_udiv-special-cases
+; X86-NEXT: orl $64, %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: .LBB9_8: # %_udiv-special-cases
+; X86-NEXT: movl $125, %ebx
+; X86-NEXT: subl %edx, %ebx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: je .LBB9_9
+; X86-NEXT: # %bb.10: # %select.false.sink
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: movl $127, %ecx
+; X86-NEXT: cmpl %ebx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: sbbl %edi, %esi
+; X86-NEXT: setb %al
+; X86-NEXT: .LBB9_11: # %select.end
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: testb %al, %al
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: jne .LBB9_13
+; X86-NEXT: # %bb.12: # %select.end
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl 36(%ebp), %esi
+; X86-NEXT: .LBB9_13: # %select.end
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB9_14
+; X86-NEXT: # %bb.20: # %select.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: xorl $127, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: je .LBB9_21
+; X86-NEXT: # %bb.18: # %udiv-bb1
+; X86-NEXT: movl 24(%ebp), %ebx
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %edx
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorb $127, %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 136(%esp,%eax), %esi
+; X86-NEXT: movl 140(%esp,%eax), %edx
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 128(%esp,%eax), %edx
+; X86-NEXT: movl 132(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $1, %ebx
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jb .LBB9_19
+; X86-NEXT: # %bb.15: # %udiv-preheader
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 92(%esp,%eax), %esi
+; X86-NEXT: movl 88(%esp,%eax), %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: shrdl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%eax), %edi
+; X86-NEXT: movl 84(%esp,%eax), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: shrdl %cl, %edx, %eax
+; X86-NEXT: shrl %cl, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %ebx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $7, %edx
+; X86-NEXT: addl $-1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB9_16: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %esi
+; X86-NEXT: shldl $1, %eax, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %edi, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $7, %edx
+; X86-NEXT: andl %edx, %ecx
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: sbbl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl $0, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB9_16
+; X86-NEXT: .LBB9_17: # %udiv-loop-exit
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %eax
+; X86-NEXT: shldl $1, %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: leal (%esi,%edi,2), %ebx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: .LBB9_21: # %udiv-end
+; X86-NEXT: movl $7, %ecx
+; X86-NEXT: imull %ecx, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $7, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $7, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: movl 24(%ebp), %ebx
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: movl 36(%ebp), %edx
+; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, (%eax)
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB9_9:
+; X86-NEXT: .cfi_def_cfa %ebp, 8
+; X86-NEXT: movb $1, %al
+; X86-NEXT: jmp .LBB9_11
+; X86-NEXT: .LBB9_19:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB9_17
+; X86-NEXT: .LBB9_14:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: jmp .LBB9_21
+;
+; X64-LABEL: rem_by_7:
+; X64: # %bb.0:
+; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT: movq %rdi, %rcx
+; X64-NEXT: shrdq $63, %rsi, %rcx
+; X64-NEXT: addq %rdi, %rcx
+; X64-NEXT: andq %rax, %rdi
+; X64-NEXT: movq %rcx, %rdx
+; X64-NEXT: andq %rax, %rdx
+; X64-NEXT: shrq $62, %rsi
+; X64-NEXT: cmpq %rdi, %rdx
+; X64-NEXT: adcq %rsi, %rcx
+; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: movabsq $5270498306774157605, %rdx # imm = 0x4924924924924925
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %rdx
+; X64-NEXT: shrq %rdx
+; X64-NEXT: leaq (,%rdx,8), %rax
+; X64-NEXT: subq %rax, %rdx
+; X64-NEXT: addq %rdx, %rcx
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: retq
+ %rem = urem i128 %x, 7
+ ret i128 %rem
+}
+
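rem_by_7 is the textbook Hacker's Delight case: (1 << 3) % 7 == 1, so any chunk width that is a multiple of 3 works, and the X64 lowering above picks 63-bit chunks (shrdq $63 with the 0x7FFFFFFFFFFFFFFF mask). The classic digit-summing form of the same identity, shown on a 64-bit value (illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // Summing the octal digits of X preserves its remainder modulo 7,
  // because each digit has weight 8^i == 1 (mod 7).
  uint64_t X = 0x123456789abcdef0ULL;
  uint64_t Sum = 0;
  for (uint64_t V = X; V != 0; V >>= 3)
    Sum += V & 7;
  assert(Sum % 7 == X % 7);
  return 0;
}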
+define i128 @rem_by_14(i128 %x) {
+; X86-LABEL: rem_by_14:
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $160, %esp
+; X86-NEXT: .cfi_offset %esi, -20
+; X86-NEXT: .cfi_offset %edi, -16
+; X86-NEXT: .cfi_offset %ebx, -12
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: jne .LBB10_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: orl $32, %edx
+; X86-NEXT: jmp .LBB10_3
+; X86-NEXT: .LBB10_1:
+; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: .LBB10_3: # %_udiv-special-cases
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: jne .LBB10_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: bsrl %esi, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
+; X86-NEXT: jmp .LBB10_6
+; X86-NEXT: .LBB10_4:
+; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: .LBB10_6: # %_udiv-special-cases
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: jne .LBB10_8
+; X86-NEXT: # %bb.7: # %_udiv-special-cases
+; X86-NEXT: orl $64, %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: .LBB10_8: # %_udiv-special-cases
+; X86-NEXT: movl $124, %ebx
+; X86-NEXT: subl %edx, %ebx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: je .LBB10_9
+; X86-NEXT: # %bb.10: # %select.false.sink
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: movl $127, %ecx
+; X86-NEXT: cmpl %ebx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: sbbl %edi, %esi
+; X86-NEXT: setb %al
+; X86-NEXT: .LBB10_11: # %select.end
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: testb %al, %al
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: jne .LBB10_13
+; X86-NEXT: # %bb.12: # %select.end
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl 36(%ebp), %esi
+; X86-NEXT: .LBB10_13: # %select.end
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB10_14
+; X86-NEXT: # %bb.20: # %select.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: xorl $127, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: je .LBB10_21
+; X86-NEXT: # %bb.18: # %udiv-bb1
+; X86-NEXT: movl 24(%ebp), %ebx
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %edx
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorb $127, %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 136(%esp,%eax), %esi
+; X86-NEXT: movl 140(%esp,%eax), %edx
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 128(%esp,%eax), %edx
+; X86-NEXT: movl 132(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $1, %ebx
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jb .LBB10_19
+; X86-NEXT: # %bb.15: # %udiv-preheader
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 92(%esp,%eax), %esi
+; X86-NEXT: movl 88(%esp,%eax), %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: shrdl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%eax), %edi
+; X86-NEXT: movl 84(%esp,%eax), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: shrdl %cl, %edx, %eax
+; X86-NEXT: shrl %cl, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %ebx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $14, %edx
+; X86-NEXT: addl $-1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB10_16: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %esi
+; X86-NEXT: shldl $1, %eax, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %edi, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $14, %edx
+; X86-NEXT: andl %edx, %ecx
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: sbbl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl $0, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB10_16
+; X86-NEXT: .LBB10_17: # %udiv-loop-exit
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %eax
+; X86-NEXT: shldl $1, %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: leal (%esi,%edi,2), %ebx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: .LBB10_21: # %udiv-end
+; X86-NEXT: movl $14, %ecx
+; X86-NEXT: imull %ecx, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $14, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $14, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: movl 24(%ebp), %ebx
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: movl 36(%ebp), %edx
+; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, (%eax)
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB10_9:
+; X86-NEXT: .cfi_def_cfa %ebp, 8
+; X86-NEXT: movb $1, %al
+; X86-NEXT: jmp .LBB10_11
+; X86-NEXT: .LBB10_19:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB10_17
+; X86-NEXT: .LBB10_14:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: jmp .LBB10_21
+;
+; X64-LABEL: rem_by_14:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: movl $14, %edx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: callq __umodti3 at PLT
+; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
+; X64-NEXT: retq
+ %rem = urem i128 %x, 14
+ ret i128 %rem
+}
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index 3117865184ecc..64b5f6f3db2e9 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -1209,3 +1209,491 @@ define <16 x i8> @PR143238(<16 x i8> %a0) {
%mask = and <16 x i8> %sdiv, splat (i8 1)
ret <16 x i8> %mask
}
+
+define <2 x i128> @v2i128_div_by_7(<2 x i128> %x) {
+; SSE-LABEL: v2i128_div_by_7:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: .cfi_def_cfa_offset 24
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: .cfi_def_cfa_offset 32
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: .cfi_def_cfa_offset 40
+; SSE-NEXT: .cfi_offset %rbx, -40
+; SSE-NEXT: .cfi_offset %r12, -32
+; SSE-NEXT: .cfi_offset %r14, -24
+; SSE-NEXT: .cfi_offset %r15, -16
+; SSE-NEXT: movq %rcx, %r9
+; SSE-NEXT: movq %rdx, %rcx
+; SSE-NEXT: movabsq $9223372036854775807, %r11 # imm = 0x7FFFFFFFFFFFFFFF
+; SSE-NEXT: movq %rsi, %rax
+; SSE-NEXT: andq %r11, %rax
+; SSE-NEXT: movq %rsi, %rdx
+; SSE-NEXT: shrdq $63, %rcx, %rdx
+; SSE-NEXT: addq %rsi, %rdx
+; SSE-NEXT: movq %rdx, %rbx
+; SSE-NEXT: andq %r11, %rbx
+; SSE-NEXT: movq %rcx, %r10
+; SSE-NEXT: shrq $62, %r10
+; SSE-NEXT: cmpq %rax, %rbx
+; SSE-NEXT: adcq %rdx, %r10
+; SSE-NEXT: andq %r11, %r10
+; SSE-NEXT: movabsq $5270498306774157605, %r15 # imm = 0x4924924924924925
+; SSE-NEXT: movq %r10, %rax
+; SSE-NEXT: mulq %r15
+; SSE-NEXT: shrq %rdx
+; SSE-NEXT: leaq (,%rdx,8), %rax
+; SSE-NEXT: subq %rax, %rdx
+; SSE-NEXT: addq %r10, %rdx
+; SSE-NEXT: subq %rdx, %rsi
+; SSE-NEXT: sbbq $0, %rcx
+; SSE-NEXT: movabsq $-5270498306774157605, %rbx # imm = 0xB6DB6DB6DB6DB6DB
+; SSE-NEXT: movq %rsi, %r10
+; SSE-NEXT: imulq %rbx, %r10
+; SSE-NEXT: movabsq $7905747460161236407, %r14 # imm = 0x6DB6DB6DB6DB6DB7
+; SSE-NEXT: movq %rsi, %rax
+; SSE-NEXT: mulq %r14
+; SSE-NEXT: movq %rax, %rsi
+; SSE-NEXT: addq %r10, %rdx
+; SSE-NEXT: imulq %r14, %rcx
+; SSE-NEXT: addq %rdx, %rcx
+; SSE-NEXT: movq %r9, %rax
+; SSE-NEXT: andq %r11, %rax
+; SSE-NEXT: movq %r9, %rdx
+; SSE-NEXT: shrdq $63, %r8, %rdx
+; SSE-NEXT: addq %r9, %rdx
+; SSE-NEXT: movq %rdx, %r12
+; SSE-NEXT: andq %r11, %r12
+; SSE-NEXT: movq %r8, %r10
+; SSE-NEXT: shrq $62, %r10
+; SSE-NEXT: cmpq %rax, %r12
+; SSE-NEXT: adcq %rdx, %r10
+; SSE-NEXT: andq %r11, %r10
+; SSE-NEXT: movq %r10, %rax
+; SSE-NEXT: mulq %r15
+; SSE-NEXT: shrq %rdx
+; SSE-NEXT: leaq (,%rdx,8), %rax
+; SSE-NEXT: subq %rax, %rdx
+; SSE-NEXT: addq %r10, %rdx
+; SSE-NEXT: subq %rdx, %r9
+; SSE-NEXT: sbbq $0, %r8
+; SSE-NEXT: imulq %r9, %rbx
+; SSE-NEXT: movq %r9, %rax
+; SSE-NEXT: mulq %r14
+; SSE-NEXT: addq %rbx, %rdx
+; SSE-NEXT: imulq %r14, %r8
+; SSE-NEXT: addq %rdx, %r8
+; SSE-NEXT: movq %rax, 16(%rdi)
+; SSE-NEXT: movq %rsi, (%rdi)
+; SSE-NEXT: movq %r8, 24(%rdi)
+; SSE-NEXT: movq %rcx, 8(%rdi)
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: .cfi_def_cfa_offset 32
+; SSE-NEXT: popq %r12
+; SSE-NEXT: .cfi_def_cfa_offset 24
+; SSE-NEXT: popq %r14
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: popq %r15
+; SSE-NEXT: .cfi_def_cfa_offset 8
+; SSE-NEXT: retq
+;
+; AVX-LABEL: v2i128_div_by_7:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %r15
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: pushq %r14
+; AVX-NEXT: .cfi_def_cfa_offset 24
+; AVX-NEXT: pushq %r12
+; AVX-NEXT: .cfi_def_cfa_offset 32
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: .cfi_def_cfa_offset 40
+; AVX-NEXT: .cfi_offset %rbx, -40
+; AVX-NEXT: .cfi_offset %r12, -32
+; AVX-NEXT: .cfi_offset %r14, -24
+; AVX-NEXT: .cfi_offset %r15, -16
+; AVX-NEXT: movq %rcx, %r9
+; AVX-NEXT: movq %rdx, %rcx
+; AVX-NEXT: movabsq $9223372036854775807, %r11 # imm = 0x7FFFFFFFFFFFFFFF
+; AVX-NEXT: movq %rsi, %rax
+; AVX-NEXT: andq %r11, %rax
+; AVX-NEXT: movq %rsi, %rdx
+; AVX-NEXT: shrdq $63, %rcx, %rdx
+; AVX-NEXT: addq %rsi, %rdx
+; AVX-NEXT: movq %rdx, %rbx
+; AVX-NEXT: andq %r11, %rbx
+; AVX-NEXT: movq %rcx, %r10
+; AVX-NEXT: shrq $62, %r10
+; AVX-NEXT: cmpq %rax, %rbx
+; AVX-NEXT: adcq %rdx, %r10
+; AVX-NEXT: andq %r11, %r10
+; AVX-NEXT: movabsq $5270498306774157605, %r15 # imm = 0x4924924924924925
+; AVX-NEXT: movq %r10, %rax
+; AVX-NEXT: mulq %r15
+; AVX-NEXT: shrq %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %r10, %rdx
+; AVX-NEXT: subq %rdx, %rsi
+; AVX-NEXT: sbbq $0, %rcx
+; AVX-NEXT: movabsq $-5270498306774157605, %rbx # imm = 0xB6DB6DB6DB6DB6DB
+; AVX-NEXT: movq %rsi, %r10
+; AVX-NEXT: imulq %rbx, %r10
+; AVX-NEXT: movabsq $7905747460161236407, %r14 # imm = 0x6DB6DB6DB6DB6DB7
+; AVX-NEXT: movq %rsi, %rax
+; AVX-NEXT: mulq %r14
+; AVX-NEXT: movq %rax, %rsi
+; AVX-NEXT: addq %r10, %rdx
+; AVX-NEXT: imulq %r14, %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: movq %r9, %rax
+; AVX-NEXT: andq %r11, %rax
+; AVX-NEXT: movq %r9, %rdx
+; AVX-NEXT: shrdq $63, %r8, %rdx
+; AVX-NEXT: addq %r9, %rdx
+; AVX-NEXT: movq %rdx, %r12
+; AVX-NEXT: andq %r11, %r12
+; AVX-NEXT: movq %r8, %r10
+; AVX-NEXT: shrq $62, %r10
+; AVX-NEXT: cmpq %rax, %r12
+; AVX-NEXT: adcq %rdx, %r10
+; AVX-NEXT: andq %r11, %r10
+; AVX-NEXT: movq %r10, %rax
+; AVX-NEXT: mulq %r15
+; AVX-NEXT: shrq %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %r10, %rdx
+; AVX-NEXT: subq %rdx, %r9
+; AVX-NEXT: sbbq $0, %r8
+; AVX-NEXT: imulq %r9, %rbx
+; AVX-NEXT: movq %r9, %rax
+; AVX-NEXT: mulq %r14
+; AVX-NEXT: addq %rbx, %rdx
+; AVX-NEXT: imulq %r14, %r8
+; AVX-NEXT: addq %rdx, %r8
+; AVX-NEXT: movq %rax, 16(%rdi)
+; AVX-NEXT: movq %rsi, (%rdi)
+; AVX-NEXT: movq %r8, 24(%rdi)
+; AVX-NEXT: movq %rcx, 8(%rdi)
+; AVX-NEXT: movq %rdi, %rax
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: .cfi_def_cfa_offset 32
+; AVX-NEXT: popq %r12
+; AVX-NEXT: .cfi_def_cfa_offset 24
+; AVX-NEXT: popq %r14
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: popq %r15
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %div = udiv <2 x i128> %x, <i128 7, i128 7>
+ ret <2 x i128> %div
+}
+
+define <2 x i128> @v2i128_div_by_14(<2 x i128> %x) {
+; SSE-LABEL: v2i128_div_by_14:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: .cfi_def_cfa_offset 24
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: .cfi_def_cfa_offset 32
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: .cfi_def_cfa_offset 40
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: .cfi_def_cfa_offset 48
+; SSE-NEXT: .cfi_offset %rbx, -48
+; SSE-NEXT: .cfi_offset %r12, -40
+; SSE-NEXT: .cfi_offset %r13, -32
+; SSE-NEXT: .cfi_offset %r14, -24
+; SSE-NEXT: .cfi_offset %r15, -16
+; SSE-NEXT: movq %r8, %rbx
+; SSE-NEXT: movq %rcx, %r14
+; SSE-NEXT: movq %rdx, %rax
+; SSE-NEXT: movq %rdi, %r15
+; SSE-NEXT: movl $14, %edx
+; SSE-NEXT: movq %rsi, %rdi
+; SSE-NEXT: movq %rax, %rsi
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: callq __udivti3 at PLT
+; SSE-NEXT: movq %rax, %r12
+; SSE-NEXT: movq %rdx, %r13
+; SSE-NEXT: movl $14, %edx
+; SSE-NEXT: movq %r14, %rdi
+; SSE-NEXT: movq %rbx, %rsi
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: callq __udivti3 at PLT
+; SSE-NEXT: movq %rdx, 24(%r15)
+; SSE-NEXT: movq %rax, 16(%r15)
+; SSE-NEXT: movq %r13, 8(%r15)
+; SSE-NEXT: movq %r12, (%r15)
+; SSE-NEXT: movq %r15, %rax
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: .cfi_def_cfa_offset 40
+; SSE-NEXT: popq %r12
+; SSE-NEXT: .cfi_def_cfa_offset 32
+; SSE-NEXT: popq %r13
+; SSE-NEXT: .cfi_def_cfa_offset 24
+; SSE-NEXT: popq %r14
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: popq %r15
+; SSE-NEXT: .cfi_def_cfa_offset 8
+; SSE-NEXT: retq
+;
+; AVX-LABEL: v2i128_div_by_14:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %r15
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: pushq %r14
+; AVX-NEXT: .cfi_def_cfa_offset 24
+; AVX-NEXT: pushq %r13
+; AVX-NEXT: .cfi_def_cfa_offset 32
+; AVX-NEXT: pushq %r12
+; AVX-NEXT: .cfi_def_cfa_offset 40
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: .cfi_def_cfa_offset 48
+; AVX-NEXT: .cfi_offset %rbx, -48
+; AVX-NEXT: .cfi_offset %r12, -40
+; AVX-NEXT: .cfi_offset %r13, -32
+; AVX-NEXT: .cfi_offset %r14, -24
+; AVX-NEXT: .cfi_offset %r15, -16
+; AVX-NEXT: movq %r8, %rbx
+; AVX-NEXT: movq %rcx, %r14
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: movq %rdi, %r15
+; AVX-NEXT: movl $14, %edx
+; AVX-NEXT: movq %rsi, %rdi
+; AVX-NEXT: movq %rax, %rsi
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: callq __udivti3 at PLT
+; AVX-NEXT: movq %rax, %r12
+; AVX-NEXT: movq %rdx, %r13
+; AVX-NEXT: movl $14, %edx
+; AVX-NEXT: movq %r14, %rdi
+; AVX-NEXT: movq %rbx, %rsi
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: callq __udivti3 at PLT
+; AVX-NEXT: movq %rdx, 24(%r15)
+; AVX-NEXT: movq %rax, 16(%r15)
+; AVX-NEXT: movq %r13, 8(%r15)
+; AVX-NEXT: movq %r12, (%r15)
+; AVX-NEXT: movq %r15, %rax
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: .cfi_def_cfa_offset 40
+; AVX-NEXT: popq %r12
+; AVX-NEXT: .cfi_def_cfa_offset 32
+; AVX-NEXT: popq %r13
+; AVX-NEXT: .cfi_def_cfa_offset 24
+; AVX-NEXT: popq %r14
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: popq %r15
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %div = udiv <2 x i128> %x, <i128 14, i128 14>
+ ret <2 x i128> %div
+}
+
+define <2 x i128> @v2i128_rem_by_7(<2 x i128> %x) {
+; SSE-LABEL: v2i128_rem_by_7:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: movq %rdx, %r9
+; SSE-NEXT: movabsq $9223372036854775807, %r10 # imm = 0x7FFFFFFFFFFFFFFF
+; SSE-NEXT: movq %rsi, %rax
+; SSE-NEXT: shrdq $63, %rdx, %rax
+; SSE-NEXT: addq %rsi, %rax
+; SSE-NEXT: andq %r10, %rsi
+; SSE-NEXT: movq %rax, %rdx
+; SSE-NEXT: andq %r10, %rdx
+; SSE-NEXT: shrq $62, %r9
+; SSE-NEXT: cmpq %rsi, %rdx
+; SSE-NEXT: adcq %rax, %r9
+; SSE-NEXT: andq %r10, %r9
+; SSE-NEXT: movabsq $5270498306774157605, %r11 # imm = 0x4924924924924925
+; SSE-NEXT: movq %r9, %rax
+; SSE-NEXT: mulq %r11
+; SSE-NEXT: movq %rdx, %rsi
+; SSE-NEXT: shrq %rsi
+; SSE-NEXT: leaq (,%rsi,8), %rax
+; SSE-NEXT: subq %rax, %rsi
+; SSE-NEXT: addq %r9, %rsi
+; SSE-NEXT: movq %rcx, %rax
+; SSE-NEXT: shrdq $63, %r8, %rax
+; SSE-NEXT: addq %rcx, %rax
+; SSE-NEXT: andq %r10, %rcx
+; SSE-NEXT: movq %rax, %rdx
+; SSE-NEXT: andq %r10, %rdx
+; SSE-NEXT: shrq $62, %r8
+; SSE-NEXT: cmpq %rcx, %rdx
+; SSE-NEXT: adcq %rax, %r8
+; SSE-NEXT: andq %r10, %r8
+; SSE-NEXT: movq %r8, %rax
+; SSE-NEXT: mulq %r11
+; SSE-NEXT: shrq %rdx
+; SSE-NEXT: leaq (,%rdx,8), %rax
+; SSE-NEXT: subq %rax, %rdx
+; SSE-NEXT: addq %r8, %rdx
+; SSE-NEXT: movq %rdx, 16(%rdi)
+; SSE-NEXT: movq %rsi, (%rdi)
+; SSE-NEXT: movq $0, 24(%rdi)
+; SSE-NEXT: movq $0, 8(%rdi)
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: v2i128_rem_by_7:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: movq %rdx, %r9
+; AVX-NEXT: movabsq $9223372036854775807, %r10 # imm = 0x7FFFFFFFFFFFFFFF
+; AVX-NEXT: movq %rsi, %rax
+; AVX-NEXT: shrdq $63, %rdx, %rax
+; AVX-NEXT: addq %rsi, %rax
+; AVX-NEXT: andq %r10, %rsi
+; AVX-NEXT: movq %rax, %rdx
+; AVX-NEXT: andq %r10, %rdx
+; AVX-NEXT: shrq $62, %r9
+; AVX-NEXT: cmpq %rsi, %rdx
+; AVX-NEXT: adcq %rax, %r9
+; AVX-NEXT: andq %r10, %r9
+; AVX-NEXT: movabsq $5270498306774157605, %r11 # imm = 0x4924924924924925
+; AVX-NEXT: movq %r9, %rax
+; AVX-NEXT: mulq %r11
+; AVX-NEXT: movq %rdx, %rsi
+; AVX-NEXT: shrq %rsi
+; AVX-NEXT: leaq (,%rsi,8), %rax
+; AVX-NEXT: subq %rax, %rsi
+; AVX-NEXT: addq %r9, %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: shrdq $63, %r8, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: andq %r10, %rcx
+; AVX-NEXT: movq %rax, %rdx
+; AVX-NEXT: andq %r10, %rdx
+; AVX-NEXT: shrq $62, %r8
+; AVX-NEXT: cmpq %rcx, %rdx
+; AVX-NEXT: adcq %rax, %r8
+; AVX-NEXT: andq %r10, %r8
+; AVX-NEXT: movq %r8, %rax
+; AVX-NEXT: mulq %r11
+; AVX-NEXT: shrq %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %r8, %rdx
+; AVX-NEXT: movq %rdx, 16(%rdi)
+; AVX-NEXT: movq %rsi, (%rdi)
+; AVX-NEXT: movq $0, 24(%rdi)
+; AVX-NEXT: movq $0, 8(%rdi)
+; AVX-NEXT: movq %rdi, %rax
+; AVX-NEXT: retq
+entry:
+ %rem = urem <2 x i128> %x, <i128 7, i128 7>
+ ret <2 x i128> %rem
+}
+
+define <2 x i128> @v2i128_rem_by_14(<2 x i128> %x) {
+; SSE-LABEL: v2i128_rem_by_14:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: .cfi_def_cfa_offset 24
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: .cfi_def_cfa_offset 32
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: .cfi_def_cfa_offset 40
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: .cfi_def_cfa_offset 48
+; SSE-NEXT: .cfi_offset %rbx, -48
+; SSE-NEXT: .cfi_offset %r12, -40
+; SSE-NEXT: .cfi_offset %r13, -32
+; SSE-NEXT: .cfi_offset %r14, -24
+; SSE-NEXT: .cfi_offset %r15, -16
+; SSE-NEXT: movq %r8, %rbx
+; SSE-NEXT: movq %rcx, %r14
+; SSE-NEXT: movq %rdx, %rax
+; SSE-NEXT: movq %rdi, %r15
+; SSE-NEXT: movl $14, %edx
+; SSE-NEXT: movq %rsi, %rdi
+; SSE-NEXT: movq %rax, %rsi
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: callq __umodti3 at PLT
+; SSE-NEXT: movq %rax, %r12
+; SSE-NEXT: movq %rdx, %r13
+; SSE-NEXT: movl $14, %edx
+; SSE-NEXT: movq %r14, %rdi
+; SSE-NEXT: movq %rbx, %rsi
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: callq __umodti3 at PLT
+; SSE-NEXT: movq %rdx, 24(%r15)
+; SSE-NEXT: movq %rax, 16(%r15)
+; SSE-NEXT: movq %r13, 8(%r15)
+; SSE-NEXT: movq %r12, (%r15)
+; SSE-NEXT: movq %r15, %rax
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: .cfi_def_cfa_offset 40
+; SSE-NEXT: popq %r12
+; SSE-NEXT: .cfi_def_cfa_offset 32
+; SSE-NEXT: popq %r13
+; SSE-NEXT: .cfi_def_cfa_offset 24
+; SSE-NEXT: popq %r14
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: popq %r15
+; SSE-NEXT: .cfi_def_cfa_offset 8
+; SSE-NEXT: retq
+;
+; AVX-LABEL: v2i128_rem_by_14:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %r15
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: pushq %r14
+; AVX-NEXT: .cfi_def_cfa_offset 24
+; AVX-NEXT: pushq %r13
+; AVX-NEXT: .cfi_def_cfa_offset 32
+; AVX-NEXT: pushq %r12
+; AVX-NEXT: .cfi_def_cfa_offset 40
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: .cfi_def_cfa_offset 48
+; AVX-NEXT: .cfi_offset %rbx, -48
+; AVX-NEXT: .cfi_offset %r12, -40
+; AVX-NEXT: .cfi_offset %r13, -32
+; AVX-NEXT: .cfi_offset %r14, -24
+; AVX-NEXT: .cfi_offset %r15, -16
+; AVX-NEXT: movq %r8, %rbx
+; AVX-NEXT: movq %rcx, %r14
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: movq %rdi, %r15
+; AVX-NEXT: movl $14, %edx
+; AVX-NEXT: movq %rsi, %rdi
+; AVX-NEXT: movq %rax, %rsi
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: callq __umodti3 at PLT
+; AVX-NEXT: movq %rax, %r12
+; AVX-NEXT: movq %rdx, %r13
+; AVX-NEXT: movl $14, %edx
+; AVX-NEXT: movq %r14, %rdi
+; AVX-NEXT: movq %rbx, %rsi
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: callq __umodti3 at PLT
+; AVX-NEXT: movq %rdx, 24(%r15)
+; AVX-NEXT: movq %rax, 16(%r15)
+; AVX-NEXT: movq %r13, 8(%r15)
+; AVX-NEXT: movq %r12, (%r15)
+; AVX-NEXT: movq %r15, %rax
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: .cfi_def_cfa_offset 40
+; AVX-NEXT: popq %r12
+; AVX-NEXT: .cfi_def_cfa_offset 32
+; AVX-NEXT: popq %r13
+; AVX-NEXT: .cfi_def_cfa_offset 24
+; AVX-NEXT: popq %r14
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: popq %r15
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %rem = urem <2 x i128> %x, <i128 14, i128 14>
+ ret <2 x i128> %rem
+}
>From 11493ae94a4114b328e3f0ec749df4b0373563a2 Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Sat, 14 Mar 2026 17:26:20 +0530
Subject: [PATCH 11/15] Move from carry propagation to accumulation sum
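
Instead of chaining UADDO/UADDO_CARRY nodes to propagate carries between
chunk additions, accumulate the masked chunks in the wider legal type and
reduce with a single UREM at the end. The chunk width W is chosen so that
2^W % Divisor == 1 and the running sum cannot overflow the legal type
(log2(NumChunks) + W < LegalWidth). For example, i128 by 7 on a 64-bit
target uses W = 60: three 60-bit chunks sum to at most 62 bits, and the
final 64-bit UREM is lowered via magic multiplication.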
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 102 +++---
llvm/test/CodeGen/AArch64/rem-by-const.ll | 87 +++---
llvm/test/CodeGen/RISCV/div-by-constant.ll | 85 ++---
.../CodeGen/RISCV/split-udiv-by-constant.ll | 147 ++++-----
.../CodeGen/RISCV/split-urem-by-constant.ll | 115 +++----
llvm/test/CodeGen/RISCV/urem-vector-lkk.ll | 24 +-
llvm/test/CodeGen/X86/divide-by-constant.ll | 34 +-
llvm/test/CodeGen/X86/divmod128.ll | 50 +--
llvm/test/CodeGen/X86/i128-udiv.ll | 122 ++++----
llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll | 292 ++++++++++--------
10 files changed, 571 insertions(+), 487 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index dda1cb1dcedd5..1c4851bb3c2cf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8232,93 +8232,73 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, Sum, Carry);
}
} else {
- // If we cannot split in two halves. Let's look for a smaller chunk
- // width where (1 << ChunkWidth) mod Divisor == 1.
- // This ensures that the sum of all such chunks modulo Divisor
- // is equivalent to the original value modulo Divisor.
+    // If we cannot split into two halves, look for a smaller chunk width W
+    // such that (1 << W) % Divisor == 1; the sum of the W-bit chunks is then
+    // congruent to the original value modulo Divisor.
const APInt &Divisor = CN->getAPIntValue();
unsigned BitWidth = VT.getScalarSizeInBits();
unsigned BestChunkWidth = 0;
- // Determine the largest legal scalar integer type we can safely use
- // for chunk operations.
+ // Determine the legal scalar integer type for chunk operations (e.g., i64).
EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
+ unsigned LegalWidth = LegalVT.getScalarSizeInBits();
+ unsigned MaxChunk = std::min<unsigned>(LegalWidth, BitWidth);
- // Clamp to the original bit width.
- unsigned MaxChunk =
- std::min<unsigned>(LegalVT.getScalarSizeInBits(), BitWidth);
-
- // Find the largest W in (MaxChunk/2, MaxChunk] such that
- // 2^W ≡ 1 (mod Divisor). If this holds, the value can be
- // reduced modulo Divisor by summing W-bit chunks.
- //
- // Instead of constructing 2^W for each candidate, compute
- // 2^MaxChunk mod Divisor once and walk downward, maintaining:
- //
- // Mod == 2^i mod Divisor
- //
- // For each decrement of i, update Mod by multiplying with
- // the modular inverse of 2 (Divisor is known to be odd here).
- // Compute 2^MaxChunk mod Divisor
+    // Precompute 2^MaxChunk mod Divisor.
APInt Mod(Divisor.getBitWidth(), 1);
for (unsigned k = 0; k < MaxChunk; ++k)
- Mod = (Mod.shl(1)).urem(Divisor);
+ Mod = Mod.shl(1).urem(Divisor);
- // Since Divisor is odd, inverse of 2 mod D is (D+1)/2
+    // Since Divisor is odd, the modular inverse of 2 is (Divisor + 1) / 2:
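+    // 2 * ((Divisor + 1) / 2) = Divisor + 1 == 1 (mod Divisor).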
APInt Inv2 = (Divisor + 1).lshr(1);
- // Walk downward to find largest valid W
+    // Search downward for the largest W where 2^W % Divisor == 1.
for (unsigned i = MaxChunk; i > MaxChunk / 2; --i) {
if (Mod.isOne()) {
- BestChunkWidth = i;
- break;
+        // Safety check: ensure the worst-case sum of NumChunks chunks
+        // cannot overflow LegalVT.
+ unsigned NumChunks = divideCeil(BitWidth, i);
+        // If the chunk width (i) plus the potential carry bits stay below
+        // the register width, there is enough slack at the top of the
+        // register to let the carries pile up safely. The maximum sum is
+        // NumChunks * (2^i - 1), so approximately we need
+        // NumChunks * 2^i < 2^LegalWidth; taking log2 of both sides gives
+        // log2(NumChunks) + i < LegalWidth.
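+        // For example, i128 by 7 with a 64-bit LegalVT: i = 63 satisfies
+        // 2^63 % 7 == 1 but fails this check (63 + 2 >= 64), so the search
+        // settles on i = 60, where 60 + 2 = 62 < 64.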
+ if (i + Log2_32_Ceil(NumChunks) < LegalWidth) {
+ BestChunkWidth = i;
+ break;
+ }
}
-
- // Move from 2^i to 2^(i-1)
Mod = (Mod * Inv2).urem(Divisor);
}
- // If we found a good chunk width, slice the number and sum the pieces.
if (!BestChunkWidth)
return false;
- EVT ChunkVT = EVT::getIntegerVT(*DAG.getContext(), BestChunkWidth);
-
SDValue In =
LL ? DAG.getNode(ISD::BUILD_PAIR, dl, VT, LL, LH) : N->getOperand(0);
+ SDValue TotalSum = DAG.getConstant(0, dl, LegalVT);
+ APInt MaskVal = APInt::getLowBitsSet(LegalWidth, BestChunkWidth);
+ SDValue Mask = DAG.getConstant(MaskVal, dl, LegalVT);
- SmallVector<SDValue, 8> Parts;
- // Split into fixed-size chunks
for (unsigned i = 0; i < BitWidth; i += BestChunkWidth) {
SDValue Shift = DAG.getShiftAmountConstant(i, VT, dl);
SDValue Chunk = DAG.getNode(ISD::SRL, dl, VT, In, Shift);
- Chunk = DAG.getNode(ISD::TRUNCATE, dl, ChunkVT, Chunk);
- Parts.push_back(Chunk);
- }
- assert(!Parts.empty() && "Failed to split divisor into chunks");
- Sum = Parts[0];
-
- // Use uaddo_carry if we can, otherwise use a compare to detect overflow.
- // same logic as used in above if condition.
- SDValue Carry = DAG.getConstant(0, dl, ChunkVT);
- EVT SetCCType =
- getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ChunkVT);
- for (unsigned i = 1; i < Parts.size(); ++i) {
- if (isOperationLegalOrCustom(ISD::UADDO_CARRY, ChunkVT)) {
- SDVTList VTList = DAG.getVTList(ChunkVT, SetCCType);
- SDValue UAdd = DAG.getNode(ISD::UADDO, dl, VTList, Sum, Parts[i]);
- Sum = DAG.getNode(ISD::UADDO_CARRY, dl, VTList, UAdd, Carry,
- UAdd.getValue(1));
- } else {
- SDValue Add = DAG.getNode(ISD::ADD, dl, ChunkVT, Sum, Parts[i]);
- SDValue NewCarry = DAG.getSetCC(dl, SetCCType, Add, Sum, ISD::SETULT);
- NewCarry = DAG.getZExtOrTrunc(NewCarry, dl, ChunkVT);
- Sum = DAG.getNode(ISD::ADD, dl, ChunkVT, Add, Carry);
- Carry = NewCarry;
- }
- }
-
- Sum = DAG.getNode(ISD::ZERO_EXTEND, dl, HiLoVT, Sum);
+ // Truncate to LegalVT
+ SDValue TruncChunk = DAG.getNode(ISD::TRUNCATE, dl, LegalVT, Chunk);
+ // For the last chunk, we might not need a mask if it's smaller than
+ // BestChunkWidth, but applying it is always safe.
+ SDValue MaskedChunk =
+ DAG.getNode(ISD::AND, dl, LegalVT, TruncChunk, Mask);
+ TotalSum = DAG.getNode(ISD::ADD, dl, LegalVT, TotalSum, MaskedChunk);
+ }
+
+ // Final reduction: TotalSum % Divisor.
+ // Since TotalSum is in LegalVT, this UREM will be lowered via magic
+ // multiplication.
+ SDValue ResRem =
+ DAG.getNode(ISD::UREM, dl, LegalVT, TotalSum,
+ DAG.getConstant(Divisor.trunc(LegalWidth), dl, LegalVT));
+
+ Sum = DAG.getNode(ISD::ZERO_EXTEND, dl, HiLoVT, ResRem);
}
// If we didn't find a sum, we can't do the expansion.
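
For reference, a minimal scalar sketch of the remainder path this hunk
builds (illustrative only, not part of the patch; assumes a 64-bit legal
type and divisor 7, so W = 60):

    #include <cstdint>

    // x mod 7 via 60-bit chunk summation: since 2^60 % 7 == 1, the sum of
    // the 60-bit digits of x is congruent to x (mod 7), and three masked
    // chunks total at most 62 bits, so the accumulation cannot overflow.
    uint64_t urem7ViaChunks(unsigned __int128 X) {
      const uint64_t Mask = (1ULL << 60) - 1; // low 60 bits of each chunk
      uint64_t Sum = 0;
      for (unsigned I = 0; I < 128; I += 60)
        Sum += (uint64_t)(X >> I) & Mask;
      return Sum % 7; // final reduction, lowered via magic multiplication
    }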
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index d2875d9a3fc05..23655839ae164 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -500,23 +500,27 @@ entry:
define i128 @ui128_7(i128 %a, i128 %b) {
; CHECK-SD-LABEL: ui128_7:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: extr x9, x1, x0, #63
-; CHECK-SD-NEXT: mov x8, #18725 // =0x4925
-; CHECK-SD-NEXT: and x11, x0, #0x7fffffffffffffff
-; CHECK-SD-NEXT: movk x8, #9362, lsl #16
-; CHECK-SD-NEXT: add x9, x0, x9
-; CHECK-SD-NEXT: movk x8, #37449, lsl #32
-; CHECK-SD-NEXT: add x10, x9, x1, lsr #62
-; CHECK-SD-NEXT: and x9, x9, #0x7fffffffffffffff
-; CHECK-SD-NEXT: movk x8, #18724, lsl #48
-; CHECK-SD-NEXT: cmp x9, x11
+; CHECK-SD-NEXT: extr x8, x1, x0, #60
+; CHECK-SD-NEXT: and x9, x0, #0xfffffffffffffff
+; CHECK-SD-NEXT: and x8, x8, #0xfffffffffffffff
+; CHECK-SD-NEXT: add x8, x9, x8
+; CHECK-SD-NEXT: mov x9, #18725 // =0x4925
+; CHECK-SD-NEXT: movk x9, #9362, lsl #16
+; CHECK-SD-NEXT: add x8, x8, x1, lsr #56
; CHECK-SD-NEXT: mov x1, xzr
-; CHECK-SD-NEXT: cinc x9, x10, lo
-; CHECK-SD-NEXT: and x9, x9, #0x7fffffffffffffff
-; CHECK-SD-NEXT: umulh x8, x9, x8
-; CHECK-SD-NEXT: lsr x8, x8, #1
-; CHECK-SD-NEXT: sub x8, x8, x8, lsl #3
-; CHECK-SD-NEXT: add x0, x9, x8
+; CHECK-SD-NEXT: movk x9, #37449, lsl #32
+; CHECK-SD-NEXT: movk x9, #18724, lsl #48
+; CHECK-SD-NEXT: umulh x9, x8, x9
+; CHECK-SD-NEXT: lsr x9, x9, #1
+; CHECK-SD-NEXT: sub x9, x9, x9, lsl #3
+; CHECK-SD-NEXT: add x8, x8, x9
+; CHECK-SD-NEXT: mov x9, #9363 // =0x2493
+; CHECK-SD-NEXT: movk x9, #37449, lsl #16
+; CHECK-SD-NEXT: movk x9, #18724, lsl #32
+; CHECK-SD-NEXT: movk x9, #9362, lsl #48
+; CHECK-SD-NEXT: umulh x9, x8, x9
+; CHECK-SD-NEXT: sub x9, x9, x9, lsl #3
+; CHECK-SD-NEXT: add x0, x8, x9
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: ui128_7:
@@ -2566,7 +2570,8 @@ define <3 x i32> @uv3i32_7(<3 x i32> %d, <3 x i32> %e) {
; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32
; CHECK-SD-NEXT: sub w9, w9, w9, lsl #3
; CHECK-SD-NEXT: sub v2.2s, v0.2s, v1.2s
-; CHECK-SD: add w8, w8, w9
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 def $q0
+; CHECK-SD-NEXT: add w8, w8, w9
; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0
; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #1
; CHECK-SD-NEXT: add v1.2s, v2.2s, v1.2s
@@ -3089,36 +3094,40 @@ entry:
define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) {
; CHECK-SD-LABEL: uv2i128_7:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: extr x9, x1, x0, #63
-; CHECK-SD-NEXT: extr x8, x3, x2, #63
-; CHECK-SD-NEXT: and x10, x0, #0x7fffffffffffffff
-; CHECK-SD-NEXT: and x12, x2, #0x7fffffffffffffff
-; CHECK-SD-NEXT: add x9, x0, x9
-; CHECK-SD-NEXT: add x8, x2, x8
-; CHECK-SD-NEXT: add x11, x9, x1, lsr #62
-; CHECK-SD-NEXT: and x9, x9, #0x7fffffffffffffff
-; CHECK-SD-NEXT: mov x1, xzr
-; CHECK-SD-NEXT: cmp x9, x10
-; CHECK-SD-NEXT: add x9, x8, x3, lsr #62
-; CHECK-SD-NEXT: and x8, x8, #0x7fffffffffffffff
-; CHECK-SD-NEXT: cinc x10, x11, lo
+; CHECK-SD-NEXT: extr x9, x1, x0, #60
+; CHECK-SD-NEXT: extr x8, x3, x2, #60
+; CHECK-SD-NEXT: and x10, x0, #0xfffffffffffffff
; CHECK-SD-NEXT: mov x11, #18725 // =0x4925
-; CHECK-SD-NEXT: cmp x8, x12
+; CHECK-SD-NEXT: mov x12, #9363 // =0x2493
+; CHECK-SD-NEXT: and x9, x9, #0xfffffffffffffff
+; CHECK-SD-NEXT: and x8, x8, #0xfffffffffffffff
; CHECK-SD-NEXT: movk x11, #9362, lsl #16
-; CHECK-SD-NEXT: cinc x9, x9, lo
-; CHECK-SD-NEXT: and x8, x10, #0x7fffffffffffffff
+; CHECK-SD-NEXT: add x9, x10, x9
+; CHECK-SD-NEXT: and x10, x2, #0xfffffffffffffff
; CHECK-SD-NEXT: movk x11, #37449, lsl #32
-; CHECK-SD-NEXT: and x9, x9, #0x7fffffffffffffff
-; CHECK-SD-NEXT: mov x3, xzr
+; CHECK-SD-NEXT: add x8, x10, x8
+; CHECK-SD-NEXT: add x9, x9, x1, lsr #56
; CHECK-SD-NEXT: movk x11, #18724, lsl #48
-; CHECK-SD-NEXT: umulh x10, x8, x11
-; CHECK-SD-NEXT: umulh x11, x9, x11
+; CHECK-SD-NEXT: add x8, x8, x3, lsr #56
+; CHECK-SD-NEXT: movk x12, #37449, lsl #16
+; CHECK-SD-NEXT: mov x1, xzr
+; CHECK-SD-NEXT: umulh x10, x9, x11
+; CHECK-SD-NEXT: movk x12, #18724, lsl #32
+; CHECK-SD-NEXT: mov x3, xzr
+; CHECK-SD-NEXT: movk x12, #9362, lsl #48
+; CHECK-SD-NEXT: umulh x11, x8, x11
; CHECK-SD-NEXT: lsr x10, x10, #1
; CHECK-SD-NEXT: lsr x11, x11, #1
; CHECK-SD-NEXT: sub x10, x10, x10, lsl #3
; CHECK-SD-NEXT: sub x11, x11, x11, lsl #3
-; CHECK-SD-NEXT: add x0, x8, x10
-; CHECK-SD-NEXT: add x2, x9, x11
+; CHECK-SD-NEXT: add x9, x9, x10
+; CHECK-SD-NEXT: umulh x10, x9, x12
+; CHECK-SD-NEXT: add x8, x8, x11
+; CHECK-SD-NEXT: umulh x11, x8, x12
+; CHECK-SD-NEXT: sub x10, x10, x10, lsl #3
+; CHECK-SD-NEXT: sub x11, x11, x11, lsl #3
+; CHECK-SD-NEXT: add x0, x9, x10
+; CHECK-SD-NEXT: add x2, x8, x11
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: uv2i128_7:
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index bf73f37b09d08..062b93843c80a 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -113,29 +113,30 @@ define i64 @udiv64_constant_no_add(i64 %a) nounwind {
define i64 @udiv64_constant_add(i64 %a) nounwind {
; RV32IM-LABEL: udiv64_constant_add:
; RV32IM: # %bb.0:
-; RV32IM-NEXT: lui a2, 262144
-; RV32IM-NEXT: slli a3, a1, 2
-; RV32IM-NEXT: srli a4, a0, 30
-; RV32IM-NEXT: srli a5, a1, 28
+; RV32IM-NEXT: lui a2, 32768
+; RV32IM-NEXT: slli a3, a1, 5
+; RV32IM-NEXT: srli a4, a0, 27
+; RV32IM-NEXT: srli a5, a1, 22
; RV32IM-NEXT: lui a6, 149797
-; RV32IM-NEXT: addi a2, a2, -1
; RV32IM-NEXT: or a3, a4, a3
-; RV32IM-NEXT: and a4, a0, a2
-; RV32IM-NEXT: add a3, a0, a3
-; RV32IM-NEXT: add a5, a3, a5
-; RV32IM-NEXT: and a3, a3, a2
-; RV32IM-NEXT: sltu a3, a3, a4
; RV32IM-NEXT: lui a4, 449390
-; RV32IM-NEXT: add a3, a5, a3
-; RV32IM-NEXT: lui a5, 748983
+; RV32IM-NEXT: addi a2, a2, -1
; RV32IM-NEXT: addi a6, a6, -1755
+; RV32IM-NEXT: and a3, a3, a2
+; RV32IM-NEXT: and a2, a0, a2
+; RV32IM-NEXT: add a2, a2, a3
+; RV32IM-NEXT: add a2, a2, a5
+; RV32IM-NEXT: mulhu a3, a2, a6
+; RV32IM-NEXT: slli a5, a3, 3
+; RV32IM-NEXT: sub a3, a3, a5
+; RV32IM-NEXT: lui a5, 748983
; RV32IM-NEXT: addi a4, a4, -1171
; RV32IM-NEXT: addi a5, a5, -585
-; RV32IM-NEXT: and a2, a3, a2
+; RV32IM-NEXT: add a2, a2, a3
; RV32IM-NEXT: mulhu a3, a2, a6
; RV32IM-NEXT: slli a6, a3, 3
+; RV32IM-NEXT: sub a3, a3, a6
; RV32IM-NEXT: add a2, a2, a3
-; RV32IM-NEXT: sub a2, a2, a6
; RV32IM-NEXT: sub a3, a0, a2
; RV32IM-NEXT: sltu a0, a0, a2
; RV32IM-NEXT: mul a2, a3, a4
@@ -149,37 +150,39 @@ define i64 @udiv64_constant_add(i64 %a) nounwind {
;
; RV32IMZB-LABEL: udiv64_constant_add:
; RV32IMZB: # %bb.0:
-; RV32IMZB-NEXT: srli a2, a0, 30
-; RV32IMZB-NEXT: srli a3, a1, 28
-; RV32IMZB-NEXT: lui a4, 786432
-; RV32IMZB-NEXT: slli a5, a0, 2
-; RV32IMZB-NEXT: lui a6, 149797
-; RV32IMZB-NEXT: sh2add a2, a1, a2
-; RV32IMZB-NEXT: srli a5, a5, 2
-; RV32IMZB-NEXT: add a2, a0, a2
-; RV32IMZB-NEXT: add a3, a2, a3
+; RV32IMZB-NEXT: slli a2, a1, 5
+; RV32IMZB-NEXT: srli a3, a0, 27
+; RV32IMZB-NEXT: lui a4, 1015808
+; RV32IMZB-NEXT: slli a5, a0, 5
+; RV32IMZB-NEXT: srli a6, a1, 22
+; RV32IMZB-NEXT: or a2, a3, a2
+; RV32IMZB-NEXT: lui a3, 149797
; RV32IMZB-NEXT: andn a2, a2, a4
-; RV32IMZB-NEXT: sltu a2, a2, a5
-; RV32IMZB-NEXT: lui a5, 449390
-; RV32IMZB-NEXT: add a2, a3, a2
-; RV32IMZB-NEXT: lui a3, 748983
-; RV32IMZB-NEXT: addi a6, a6, -1755
-; RV32IMZB-NEXT: addi a5, a5, -1171
-; RV32IMZB-NEXT: addi a3, a3, -585
-; RV32IMZB-NEXT: andn a2, a2, a4
-; RV32IMZB-NEXT: mulhu a4, a2, a6
-; RV32IMZB-NEXT: slli a6, a4, 3
-; RV32IMZB-NEXT: add a2, a2, a4
-; RV32IMZB-NEXT: sub a2, a2, a6
-; RV32IMZB-NEXT: sub a4, a0, a2
+; RV32IMZB-NEXT: lui a4, 449390
+; RV32IMZB-NEXT: srli a5, a5, 5
+; RV32IMZB-NEXT: addi a3, a3, -1755
+; RV32IMZB-NEXT: add a5, a5, a6
+; RV32IMZB-NEXT: add a2, a5, a2
+; RV32IMZB-NEXT: mulhu a5, a2, a3
+; RV32IMZB-NEXT: slli a6, a5, 3
+; RV32IMZB-NEXT: sub a5, a5, a6
+; RV32IMZB-NEXT: lui a6, 748983
+; RV32IMZB-NEXT: addi a4, a4, -1171
+; RV32IMZB-NEXT: addi a6, a6, -585
+; RV32IMZB-NEXT: add a2, a2, a5
+; RV32IMZB-NEXT: mulhu a3, a2, a3
+; RV32IMZB-NEXT: slli a5, a3, 3
+; RV32IMZB-NEXT: sub a3, a3, a5
+; RV32IMZB-NEXT: add a2, a2, a3
+; RV32IMZB-NEXT: sub a3, a0, a2
; RV32IMZB-NEXT: sltu a0, a0, a2
-; RV32IMZB-NEXT: mul a2, a4, a5
-; RV32IMZB-NEXT: mulhu a5, a4, a3
+; RV32IMZB-NEXT: mul a2, a3, a4
+; RV32IMZB-NEXT: mulhu a4, a3, a6
; RV32IMZB-NEXT: sub a1, a1, a0
-; RV32IMZB-NEXT: add a2, a5, a2
-; RV32IMZB-NEXT: mul a1, a1, a3
+; RV32IMZB-NEXT: add a2, a4, a2
+; RV32IMZB-NEXT: mul a1, a1, a6
; RV32IMZB-NEXT: add a1, a2, a1
-; RV32IMZB-NEXT: mul a0, a4, a3
+; RV32IMZB-NEXT: mul a0, a3, a6
; RV32IMZB-NEXT: ret
;
; RV64-LABEL: udiv64_constant_add:
diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
index b151370a15edc..b3f673415f923 100644
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -117,29 +117,30 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind {
define iXLen2 @test_udiv_7(iXLen2 %x) nounwind {
; RV32-LABEL: test_udiv_7:
; RV32: # %bb.0:
-; RV32-NEXT: lui a2, 262144
-; RV32-NEXT: slli a3, a1, 2
-; RV32-NEXT: srli a4, a0, 30
-; RV32-NEXT: srli a5, a1, 28
+; RV32-NEXT: lui a2, 32768
+; RV32-NEXT: slli a3, a1, 5
+; RV32-NEXT: srli a4, a0, 27
+; RV32-NEXT: srli a5, a1, 22
; RV32-NEXT: lui a6, 149797
-; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: or a3, a4, a3
-; RV32-NEXT: and a4, a0, a2
-; RV32-NEXT: add a3, a0, a3
-; RV32-NEXT: add a5, a3, a5
-; RV32-NEXT: and a3, a3, a2
-; RV32-NEXT: sltu a3, a3, a4
; RV32-NEXT: lui a4, 449390
-; RV32-NEXT: add a3, a5, a3
-; RV32-NEXT: lui a5, 748983
+; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: addi a6, a6, -1755
+; RV32-NEXT: and a3, a3, a2
+; RV32-NEXT: and a2, a0, a2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: mulhu a3, a2, a6
+; RV32-NEXT: slli a5, a3, 3
+; RV32-NEXT: sub a3, a3, a5
+; RV32-NEXT: lui a5, 748983
; RV32-NEXT: addi a4, a4, -1171
; RV32-NEXT: addi a5, a5, -585
-; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: add a2, a2, a3
; RV32-NEXT: mulhu a3, a2, a6
; RV32-NEXT: slli a6, a3, 3
+; RV32-NEXT: sub a3, a3, a6
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: sub a2, a2, a6
; RV32-NEXT: sub a3, a0, a2
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: mul a2, a3, a4
@@ -154,40 +155,43 @@ define iXLen2 @test_udiv_7(iXLen2 %x) nounwind {
; RV64-LABEL: test_udiv_7:
; RV64: # %bb.0:
; RV64-NEXT: li a2, -1
-; RV64-NEXT: slli a3, a1, 1
-; RV64-NEXT: srli a4, a0, 63
-; RV64-NEXT: srli a5, a1, 62
-; RV64-NEXT: lui a6, 748983
-; RV64-NEXT: srli a2, a2, 1
-; RV64-NEXT: or a3, a4, a3
-; RV64-NEXT: addi a4, a6, -585
-; RV64-NEXT: slli a6, a4, 33
-; RV64-NEXT: add a4, a4, a6
-; RV64-NEXT: and a6, a0, a2
-; RV64-NEXT: add a3, a0, a3
-; RV64-NEXT: add a5, a3, a5
-; RV64-NEXT: and a3, a3, a2
-; RV64-NEXT: sltu a3, a3, a6
+; RV64-NEXT: slli a3, a1, 4
+; RV64-NEXT: srli a4, a0, 60
+; RV64-NEXT: srli a5, a1, 56
; RV64-NEXT: lui a6, %hi(.LCPI2_0)
+; RV64-NEXT: or a3, a4, a3
+; RV64-NEXT: lui a4, 748983
+; RV64-NEXT: srli a2, a2, 4
; RV64-NEXT: ld a6, %lo(.LCPI2_0)(a6)
-; RV64-NEXT: add a3, a5, a3
-; RV64-NEXT: lui a5, %hi(.LCPI2_1)
-; RV64-NEXT: ld a5, %lo(.LCPI2_1)(a5)
-; RV64-NEXT: and a2, a3, a2
-; RV64-NEXT: mulhu a3, a2, a6
-; RV64-NEXT: srli a3, a3, 1
-; RV64-NEXT: slli a6, a3, 3
+; RV64-NEXT: addi a4, a4, -585
+; RV64-NEXT: and a3, a3, a2
+; RV64-NEXT: and a2, a0, a2
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: sub a2, a2, a6
-; RV64-NEXT: sub a3, a0, a2
+; RV64-NEXT: slli a3, a4, 33
+; RV64-NEXT: add a3, a4, a3
+; RV64-NEXT: lui a4, %hi(.LCPI2_1)
+; RV64-NEXT: ld a4, %lo(.LCPI2_1)(a4)
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: mulhu a5, a2, a6
+; RV64-NEXT: srli a5, a5, 1
+; RV64-NEXT: slli a6, a5, 3
+; RV64-NEXT: sub a5, a5, a6
+; RV64-NEXT: lui a6, %hi(.LCPI2_2)
+; RV64-NEXT: ld a6, %lo(.LCPI2_2)(a6)
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: mulhu a4, a2, a4
+; RV64-NEXT: slli a5, a4, 3
+; RV64-NEXT: sub a4, a4, a5
+; RV64-NEXT: add a2, a2, a4
+; RV64-NEXT: sub a4, a0, a2
; RV64-NEXT: sltu a0, a0, a2
-; RV64-NEXT: mul a2, a3, a5
-; RV64-NEXT: mulhu a5, a3, a4
+; RV64-NEXT: mul a2, a4, a6
+; RV64-NEXT: mulhu a5, a4, a3
; RV64-NEXT: sub a1, a1, a0
; RV64-NEXT: add a2, a5, a2
-; RV64-NEXT: mul a1, a1, a4
+; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, a2, a1
-; RV64-NEXT: mul a0, a3, a4
+; RV64-NEXT: mul a0, a4, a3
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 7
ret iXLen2 %a
@@ -196,37 +200,39 @@ define iXLen2 @test_udiv_7(iXLen2 %x) nounwind {
define iXLen2 @test_udiv_9(iXLen2 %x) nounwind {
; RV32-LABEL: test_udiv_9:
; RV32: # %bb.0:
-; RV32-NEXT: lui a2, 262144
-; RV32-NEXT: slli a3, a1, 2
-; RV32-NEXT: srli a4, a0, 30
-; RV32-NEXT: srli a5, a1, 28
-; RV32-NEXT: lui a6, 233017
-; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: lui a2, 4096
+; RV32-NEXT: slli a3, a1, 8
+; RV32-NEXT: srli a4, a0, 24
+; RV32-NEXT: srli a5, a1, 16
+; RV32-NEXT: lui a6, 116508
; RV32-NEXT: or a3, a4, a3
-; RV32-NEXT: and a4, a0, a2
-; RV32-NEXT: add a3, a0, a3
-; RV32-NEXT: add a5, a3, a5
-; RV32-NEXT: and a3, a3, a2
-; RV32-NEXT: sltu a3, a3, a4
; RV32-NEXT: lui a4, 582542
-; RV32-NEXT: addi a6, a6, -455
-; RV32-NEXT: addi a4, a4, 910
-; RV32-NEXT: add a3, a5, a3
-; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: addi a6, a6, 1821
+; RV32-NEXT: and a3, a3, a2
+; RV32-NEXT: and a2, a0, a2
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, a2, a5
; RV32-NEXT: mulhu a3, a2, a6
-; RV32-NEXT: srli a3, a3, 1
; RV32-NEXT: slli a5, a3, 3
+; RV32-NEXT: add a3, a5, a3
+; RV32-NEXT: lui a5, 233017
+; RV32-NEXT: addi a4, a4, 910
+; RV32-NEXT: addi a5, a5, -455
+; RV32-NEXT: sub a2, a2, a3
+; RV32-NEXT: mulhu a3, a2, a6
+; RV32-NEXT: slli a6, a3, 3
+; RV32-NEXT: or a3, a6, a3
; RV32-NEXT: sub a2, a2, a3
-; RV32-NEXT: sub a2, a2, a5
; RV32-NEXT: sub a3, a0, a2
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: mul a2, a3, a4
-; RV32-NEXT: mulhu a4, a3, a6
+; RV32-NEXT: mulhu a4, a3, a5
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: add a2, a4, a2
-; RV32-NEXT: mul a1, a1, a6
+; RV32-NEXT: mul a1, a1, a5
; RV32-NEXT: add a1, a2, a1
-; RV32-NEXT: mul a0, a3, a6
+; RV32-NEXT: mul a0, a3, a5
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_9:
@@ -236,24 +242,25 @@ define iXLen2 @test_udiv_9(iXLen2 %x) nounwind {
; RV64-NEXT: srli a4, a0, 60
; RV64-NEXT: srli a5, a1, 56
; RV64-NEXT: lui a6, %hi(.LCPI3_0)
-; RV64-NEXT: srli a2, a2, 4
; RV64-NEXT: or a3, a4, a3
-; RV64-NEXT: and a4, a0, a2
-; RV64-NEXT: add a3, a0, a3
-; RV64-NEXT: add a5, a3, a5
-; RV64-NEXT: and a3, a3, a2
-; RV64-NEXT: sltu a3, a3, a4
; RV64-NEXT: lui a4, %hi(.LCPI3_1)
+; RV64-NEXT: srli a2, a2, 4
+; RV64-NEXT: ld a6, %lo(.LCPI3_0)(a6)
+; RV64-NEXT: and a3, a3, a2
+; RV64-NEXT: and a2, a0, a2
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: mulhu a3, a2, a6
+; RV64-NEXT: slli a5, a3, 3
; RV64-NEXT: add a3, a5, a3
; RV64-NEXT: lui a5, %hi(.LCPI3_2)
-; RV64-NEXT: ld a6, %lo(.LCPI3_0)(a6)
; RV64-NEXT: ld a4, %lo(.LCPI3_1)(a4)
; RV64-NEXT: ld a5, %lo(.LCPI3_2)(a5)
-; RV64-NEXT: and a2, a3, a2
+; RV64-NEXT: sub a2, a2, a3
; RV64-NEXT: mulhu a3, a2, a6
; RV64-NEXT: slli a6, a3, 3
+; RV64-NEXT: or a3, a6, a3
; RV64-NEXT: sub a2, a2, a3
-; RV64-NEXT: sub a2, a2, a6
; RV64-NEXT: sub a3, a0, a2
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: mul a2, a3, a4
diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
index 4b0c41861664a..d2033d7caf602 100644
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -79,50 +79,54 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind {
define iXLen2 @test_urem_7(iXLen2 %x) nounwind {
; RV32-LABEL: test_urem_7:
; RV32: # %bb.0:
-; RV32-NEXT: lui a2, 262144
-; RV32-NEXT: slli a3, a1, 2
-; RV32-NEXT: srli a4, a0, 30
-; RV32-NEXT: srli a1, a1, 28
-; RV32-NEXT: lui a5, 149797
-; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: lui a2, 32768
+; RV32-NEXT: slli a3, a1, 5
+; RV32-NEXT: srli a4, a0, 27
+; RV32-NEXT: srli a1, a1, 22
; RV32-NEXT: or a3, a4, a3
-; RV32-NEXT: addi a4, a5, -1755
-; RV32-NEXT: and a5, a0, a2
-; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: and a3, a0, a2
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: sltu a1, a3, a5
-; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: lui a4, 149797
+; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: and a0, a0, a2
-; RV32-NEXT: mulhu a1, a0, a4
+; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: addi a1, a4, -1755
+; RV32-NEXT: mulhu a2, a0, a1
+; RV32-NEXT: slli a3, a2, 3
+; RV32-NEXT: sub a2, a2, a3
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: mulhu a1, a0, a1
; RV32-NEXT: slli a2, a1, 3
+; RV32-NEXT: sub a1, a1, a2
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: sub a0, a0, a2
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
;
; RV64-LABEL: test_urem_7:
; RV64: # %bb.0:
; RV64-NEXT: li a2, -1
-; RV64-NEXT: slli a3, a1, 1
-; RV64-NEXT: srli a4, a0, 63
-; RV64-NEXT: srli a1, a1, 62
-; RV64-NEXT: lui a5, %hi(.LCPI2_0)
-; RV64-NEXT: srli a2, a2, 1
+; RV64-NEXT: slli a3, a1, 4
+; RV64-NEXT: srli a4, a0, 60
+; RV64-NEXT: srli a1, a1, 56
; RV64-NEXT: or a3, a4, a3
-; RV64-NEXT: ld a4, %lo(.LCPI2_0)(a5)
-; RV64-NEXT: and a5, a0, a2
-; RV64-NEXT: add a0, a0, a3
-; RV64-NEXT: and a3, a0, a2
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: sltu a1, a3, a5
-; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: lui a4, %hi(.LCPI2_0)
+; RV64-NEXT: srli a2, a2, 4
+; RV64-NEXT: ld a4, %lo(.LCPI2_0)(a4)
; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: and a2, a3, a2
+; RV64-NEXT: lui a3, %hi(.LCPI2_1)
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: add a0, a0, a1
; RV64-NEXT: mulhu a1, a0, a4
; RV64-NEXT: srli a1, a1, 1
+; RV64-NEXT: ld a2, %lo(.LCPI2_1)(a3)
+; RV64-NEXT: slli a3, a1, 3
+; RV64-NEXT: sub a1, a1, a3
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: mulhu a1, a0, a2
; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: sub a0, a0, a2
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
%a = urem iXLen2 %x, 7
@@ -132,26 +136,26 @@ define iXLen2 @test_urem_7(iXLen2 %x) nounwind {
define iXLen2 @test_urem_9(iXLen2 %x) nounwind {
; RV32-LABEL: test_urem_9:
; RV32: # %bb.0:
-; RV32-NEXT: lui a2, 262144
-; RV32-NEXT: slli a3, a1, 2
-; RV32-NEXT: srli a4, a0, 30
-; RV32-NEXT: srli a1, a1, 28
-; RV32-NEXT: lui a5, 233017
-; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: lui a2, 4096
+; RV32-NEXT: slli a3, a1, 8
+; RV32-NEXT: srli a4, a0, 24
+; RV32-NEXT: srli a1, a1, 16
; RV32-NEXT: or a3, a4, a3
-; RV32-NEXT: addi a4, a5, -455
-; RV32-NEXT: and a5, a0, a2
-; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: and a3, a0, a2
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: sltu a1, a3, a5
-; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: lui a4, 116508
+; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: and a0, a0, a2
-; RV32-NEXT: mulhu a1, a0, a4
-; RV32-NEXT: srli a1, a1, 1
+; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: addi a1, a4, 1821
+; RV32-NEXT: mulhu a2, a0, a1
+; RV32-NEXT: slli a3, a2, 3
+; RV32-NEXT: add a2, a3, a2
+; RV32-NEXT: sub a0, a0, a2
+; RV32-NEXT: mulhu a1, a0, a1
; RV32-NEXT: slli a2, a1, 3
+; RV32-NEXT: or a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
-; RV32-NEXT: sub a0, a0, a2
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
;
@@ -160,22 +164,23 @@ define iXLen2 @test_urem_9(iXLen2 %x) nounwind {
; RV64-NEXT: li a2, -1
; RV64-NEXT: slli a3, a1, 4
; RV64-NEXT: srli a4, a0, 60
-; RV64-NEXT: srli a1, a1, 56
-; RV64-NEXT: lui a5, %hi(.LCPI3_0)
-; RV64-NEXT: srli a2, a2, 4
; RV64-NEXT: or a3, a4, a3
-; RV64-NEXT: ld a4, %lo(.LCPI3_0)(a5)
-; RV64-NEXT: and a5, a0, a2
-; RV64-NEXT: add a0, a0, a3
-; RV64-NEXT: and a3, a0, a2
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: sltu a1, a3, a5
-; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: lui a4, %hi(.LCPI3_0)
+; RV64-NEXT: srli a2, a2, 4
+; RV64-NEXT: ld a4, %lo(.LCPI3_0)(a4)
; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: and a2, a3, a2
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: srli a1, a1, 56
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: mulhu a1, a0, a4
+; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: mulhu a1, a0, a4
; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: or a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
-; RV64-NEXT: sub a0, a0, a2
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
%a = urem iXLen2 %x, 9
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index e2f2c00c7818b..003df634699ae 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -872,23 +872,23 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV32IM-NEXT: lw s3, 8(a1)
; RV32IM-NEXT: lw s4, 12(a1)
; RV32IM-NEXT: lui a1, 1024
-; RV32IM-NEXT: lui a5, 45590
+; RV32IM-NEXT: slli a5, a4, 10
+; RV32IM-NEXT: srli a6, a2, 22
+; RV32IM-NEXT: or a5, a6, a5
+; RV32IM-NEXT: lui a6, 45590
; RV32IM-NEXT: addi a1, a1, -1
-; RV32IM-NEXT: addi a5, a5, 1069
-; RV32IM-NEXT: slli a6, a4, 10
-; RV32IM-NEXT: srli a7, a2, 22
-; RV32IM-NEXT: or a6, a7, a6
-; RV32IM-NEXT: and a7, a2, a1
+; RV32IM-NEXT: addi a6, a6, 1069
+; RV32IM-NEXT: and a2, a2, a1
; RV32IM-NEXT: srli a4, a4, 12
-; RV32IM-NEXT: add a2, a2, a6
-; RV32IM-NEXT: and a6, a2, a1
; RV32IM-NEXT: add a2, a2, a4
-; RV32IM-NEXT: sltu a4, a6, a7
-; RV32IM-NEXT: add a2, a2, a4
-; RV32IM-NEXT: and a1, a2, a1
-; RV32IM-NEXT: mulhu a2, a1, a5
+; RV32IM-NEXT: and a1, a5, a1
+; RV32IM-NEXT: add a1, a2, a1
+; RV32IM-NEXT: mulhu a2, a1, a6
; RV32IM-NEXT: li a4, 23
; RV32IM-NEXT: mul a2, a2, a4
+; RV32IM-NEXT: sub a1, a1, a2
+; RV32IM-NEXT: mulhu a2, a1, a6
+; RV32IM-NEXT: mul a2, a2, a4
; RV32IM-NEXT: sub s7, a1, a2
; RV32IM-NEXT: li a2, 1
; RV32IM-NEXT: mv a1, a3
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index f4f99749969e9..b825c2a5befbb 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -294,9 +294,11 @@ entry:
define i64 @PR23590(i64 %x) nounwind {
; X86-LABEL: PR23590:
; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
+; X86-NEXT: subl $12, %esp
; X86-NEXT: pushl $0
; X86-NEXT: pushl $12345 # imm = 0x3039
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -305,21 +307,23 @@ define i64 @PR23590(i64 %x) nounwind {
; X86-NEXT: addl $16, %esp
; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shldl $2, %esi, %eax
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: andl $1073741823, %edx # imm = 0x3FFFFFFF
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: shrl $28, %edi
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: andl $1073741823, %ebx # imm = 0x3FFFFFFF
-; X86-NEXT: cmpl %ebx, %edx
-; X86-NEXT: adcl %eax, %edi
-; X86-NEXT: andl $1073741823, %edi # imm = 0x3FFFFFFF
-; X86-NEXT: movl $613566757, %edx # imm = 0x24924925
+; X86-NEXT: andl $134217727, %eax # imm = 0x7FFFFFF
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shrdl $27, %ecx, %edx
+; X86-NEXT: andl $134217727, %edx # imm = 0x7FFFFFF
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: shrl $22, %ebx
+; X86-NEXT: addl %eax, %ebx
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: movl $613566757, %ebp # imm = 0x24924925
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: leal (,%edx,8), %eax
+; X86-NEXT: subl %eax, %edi
+; X86-NEXT: addl %ebx, %edi
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %edx
+; X86-NEXT: mull %ebp
; X86-NEXT: leal (,%edx,8), %eax
; X86-NEXT: subl %eax, %edx
; X86-NEXT: addl %edi, %edx
@@ -332,9 +336,11 @@ define i64 @PR23590(i64 %x) nounwind {
; X86-NEXT: addl %esi, %edx
; X86-NEXT: imull $-1227133513, %ecx, %ecx # imm = 0xB6DB6DB7
; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: addl $12, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-FAST-LABEL: PR23590:
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index 10b91e82f915a..262d31b44be91 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -67,21 +67,26 @@ define i64 @div128(i128 %x) nounwind {
define i64 @umod128(i128 %x) nounwind {
; X86-64-LABEL: umod128:
; X86-64: # %bb.0:
-; X86-64-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
-; X86-64-NEXT: movq %rsi, %rcx
-; X86-64-NEXT: shldq $4, %rdi, %rcx
-; X86-64-NEXT: addq %rdi, %rcx
-; X86-64-NEXT: andq %rax, %rdi
-; X86-64-NEXT: movq %rcx, %rdx
-; X86-64-NEXT: andq %rax, %rdx
+; X86-64-NEXT: movabsq $1152921504606846975, %rcx # imm = 0xFFFFFFFFFFFFFFF
+; X86-64-NEXT: movq %rdi, %rax
+; X86-64-NEXT: andq %rcx, %rax
+; X86-64-NEXT: shrdq $60, %rsi, %rdi
+; X86-64-NEXT: andq %rdi, %rcx
+; X86-64-NEXT: addq %rax, %rcx
; X86-64-NEXT: shrq $56, %rsi
-; X86-64-NEXT: cmpq %rdi, %rdx
-; X86-64-NEXT: adcq %rsi, %rcx
-; X86-64-NEXT: andq %rax, %rcx
+; X86-64-NEXT: addq %rsi, %rcx
+; X86-64-NEXT: movabsq $3353953467947191203, %rdx # imm = 0x2E8BA2E8BA2E8BA3
+; X86-64-NEXT: movq %rcx, %rax
+; X86-64-NEXT: mulq %rdx
+; X86-64-NEXT: shrq %rdx
+; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax
+; X86-64-NEXT: leaq (%rdx,%rax,2), %rax
+; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: movabsq $1676976733973595602, %rdx # imm = 0x1745D1745D1745D2
; X86-64-NEXT: movq %rcx, %rax
; X86-64-NEXT: mulq %rdx
; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax
+; X86-64-NEXT: movl %eax, %eax
; X86-64-NEXT: leaq (%rdx,%rax,2), %rax
; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: movq %rcx, %rax
@@ -89,21 +94,26 @@ define i64 @umod128(i128 %x) nounwind {
;
; WIN64-LABEL: umod128:
; WIN64: # %bb.0:
-; WIN64-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
-; WIN64-NEXT: movq %rdx, %r8
-; WIN64-NEXT: shldq $4, %rcx, %r8
-; WIN64-NEXT: addq %rcx, %r8
-; WIN64-NEXT: andq %rax, %rcx
-; WIN64-NEXT: movq %r8, %r9
-; WIN64-NEXT: andq %rax, %r9
+; WIN64-NEXT: movabsq $1152921504606846975, %r8 # imm = 0xFFFFFFFFFFFFFFF
+; WIN64-NEXT: movq %rcx, %rax
+; WIN64-NEXT: andq %r8, %rax
+; WIN64-NEXT: shrdq $60, %rdx, %rcx
+; WIN64-NEXT: andq %rcx, %r8
+; WIN64-NEXT: addq %rax, %r8
; WIN64-NEXT: shrq $56, %rdx
-; WIN64-NEXT: cmpq %rcx, %r9
-; WIN64-NEXT: adcq %rdx, %r8
-; WIN64-NEXT: andq %rax, %r8
+; WIN64-NEXT: addq %rdx, %r8
+; WIN64-NEXT: movabsq $3353953467947191203, %rcx # imm = 0x2E8BA2E8BA2E8BA3
+; WIN64-NEXT: movq %r8, %rax
+; WIN64-NEXT: mulq %rcx
+; WIN64-NEXT: shrq %rdx
+; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax
+; WIN64-NEXT: leaq (%rdx,%rax,2), %rax
+; WIN64-NEXT: subq %rax, %r8
; WIN64-NEXT: movabsq $1676976733973595602, %rcx # imm = 0x1745D1745D1745D2
; WIN64-NEXT: movq %r8, %rax
; WIN64-NEXT: mulq %rcx
; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax
+; WIN64-NEXT: movl %eax, %eax
; WIN64-NEXT: leaq (%rdx,%rax,2), %rax
; WIN64-NEXT: subq %rax, %r8
; WIN64-NEXT: movq %r8, %rax
diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll
index 3d2654d2bcf46..5e55e072e0d63 100644
--- a/llvm/test/CodeGen/X86/i128-udiv.ll
+++ b/llvm/test/CodeGen/X86/i128-udiv.ll
@@ -32,6 +32,7 @@ define i128 @test1(i128 %x) nounwind {
ret i128 %tmp
}
+; X86 doesn't have __udivti3, so the udiv is expanded into a loop.
define i128 @test2(i128 %x) nounwind {
; X86-LABEL: test2:
; X86: # %bb.0: # %_udiv-special-cases
@@ -324,11 +325,11 @@ define i128 @test2(i128 %x) nounwind {
; X64-NEXT: callq __udivti3 at PLT
; X64-NEXT: popq %rcx
; X64-NEXT: retq
-; X86 doesn't have __divti3, so the urem is expanded into a loop.
%tmp = udiv i128 %x, -73786976294838206464
ret i128 %tmp
}
+; X86 doesn't have __udivti3, so the udiv is expanded into a loop.
define i128 @test3(i128 %x) nounwind {
; X86-LABEL: test3:
; X86: # %bb.0: # %_udiv-special-cases
@@ -635,7 +636,6 @@ define i128 @test3(i128 %x) nounwind {
; X64-NEXT: callq __udivti3 at PLT
; X64-NEXT: popq %rcx
; X64-NEXT: retq
-; X86 doesn't have __divti3, so the urem is expanded into a loop.
%tmp = udiv i128 %x, -73786976294838206467
ret i128 %tmp
}
@@ -941,26 +941,30 @@ define i128 @div_by_7(i128 %x) {
;
; X64-LABEL: div_by_7:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
+; X64-NEXT: movq %rdi, %rcx
+; X64-NEXT: andq %rax, %rcx
; X64-NEXT: movq %rdi, %rdx
+; X64-NEXT: shrdq $60, %rsi, %rdx
; X64-NEXT: andq %rax, %rdx
-; X64-NEXT: movq %rdi, %r8
-; X64-NEXT: shrdq $63, %rsi, %r8
-; X64-NEXT: addq %rdi, %r8
-; X64-NEXT: movq %r8, %r9
-; X64-NEXT: andq %rax, %r9
+; X64-NEXT: addq %rcx, %rdx
; X64-NEXT: movq %rsi, %rcx
-; X64-NEXT: shrq $62, %rcx
-; X64-NEXT: cmpq %rdx, %r9
-; X64-NEXT: adcq %r8, %rcx
-; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: shrq $56, %rcx
+; X64-NEXT: addq %rdx, %rcx
; X64-NEXT: movabsq $5270498306774157605, %rdx # imm = 0x4924924924924925
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdx
-; X64-NEXT: shrq %rdx
-; X64-NEXT: leaq (,%rdx,8), %rax
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: shrq %r8
+; X64-NEXT: leaq (,%r8,8), %rax
+; X64-NEXT: subq %rax, %r8
+; X64-NEXT: addq %rcx, %r8
+; X64-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: leal (,%rdx,8), %eax
; X64-NEXT: subq %rax, %rdx
-; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: addq %r8, %rdx
; X64-NEXT: subq %rdx, %rdi
; X64-NEXT: sbbq $0, %rsi
; X64-NEXT: movabsq $-5270498306774157605, %rcx # imm = 0xB6DB6DB6DB6DB6DB
@@ -1279,21 +1283,22 @@ define i128 @div_by_9(i128 %x) {
; X64-LABEL: div_by_9:
; X64: # %bb.0: # %entry
; X64-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
+; X64-NEXT: movq %rdi, %rcx
+; X64-NEXT: andq %rax, %rcx
; X64-NEXT: movq %rdi, %rdx
+; X64-NEXT: shrdq $60, %rsi, %rdx
; X64-NEXT: andq %rax, %rdx
-; X64-NEXT: movq %rdi, %r8
-; X64-NEXT: shrdq $60, %rsi, %r8
-; X64-NEXT: addq %rdi, %r8
-; X64-NEXT: movq %r8, %r9
-; X64-NEXT: andq %rax, %r9
+; X64-NEXT: addq %rcx, %rdx
; X64-NEXT: movq %rsi, %rcx
; X64-NEXT: shrq $56, %rcx
-; X64-NEXT: cmpq %rdx, %r9
-; X64-NEXT: adcq %r8, %rcx
-; X64-NEXT: andq %rax, %rcx
-; X64-NEXT: movabsq $2049638230412172402, %rdx # imm = 0x1C71C71C71C71C72
+; X64-NEXT: addq %rdx, %rcx
+; X64-NEXT: movabsq $2049638230412172402, %r8 # imm = 0x1C71C71C71C71C72
; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %rdx
+; X64-NEXT: mulq %r8
+; X64-NEXT: leaq (%rdx,%rdx,8), %rax
+; X64-NEXT: subq %rax, %rcx
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %r8
; X64-NEXT: leaq (%rdx,%rdx,8), %rax
; X64-NEXT: subq %rax, %rcx
; X64-NEXT: subq %rcx, %rdi
@@ -1614,29 +1619,33 @@ define i128 @div_by_11(i128 %x) {
; X64-LABEL: div_by_11:
; X64: # %bb.0:
; X64-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
+; X64-NEXT: movq %rdi, %rcx
+; X64-NEXT: andq %rax, %rcx
; X64-NEXT: movq %rdi, %rdx
+; X64-NEXT: shrdq $60, %rsi, %rdx
; X64-NEXT: andq %rax, %rdx
-; X64-NEXT: movq %rdi, %r8
-; X64-NEXT: shrdq $60, %rsi, %r8
-; X64-NEXT: addq %rdi, %r8
-; X64-NEXT: movq %r8, %r9
-; X64-NEXT: andq %rax, %r9
+; X64-NEXT: addq %rcx, %rdx
; X64-NEXT: movq %rsi, %rcx
; X64-NEXT: shrq $56, %rcx
-; X64-NEXT: cmpq %rdx, %r9
-; X64-NEXT: adcq %r8, %rcx
-; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: addq %rdx, %rcx
+; X64-NEXT: movabsq $3353953467947191203, %r8 # imm = 0x2E8BA2E8BA2E8BA3
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: shrq %rdx
+; X64-NEXT: leaq (%rdx,%rdx,4), %rax
+; X64-NEXT: leaq (%rdx,%rax,2), %rax
+; X64-NEXT: subq %rax, %rcx
; X64-NEXT: movabsq $1676976733973595602, %rdx # imm = 0x1745D1745D1745D2
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdx
; X64-NEXT: leaq (%rdx,%rdx,4), %rax
+; X64-NEXT: movl %eax, %eax
; X64-NEXT: leaq (%rdx,%rax,2), %rax
; X64-NEXT: subq %rax, %rcx
; X64-NEXT: subq %rcx, %rdi
; X64-NEXT: sbbq $0, %rsi
; X64-NEXT: movabsq $-6707906935894382406, %rcx # imm = 0xA2E8BA2E8BA2E8BA
; X64-NEXT: imulq %rdi, %rcx
-; X64-NEXT: movabsq $3353953467947191203, %r8 # imm = 0x2E8BA2E8BA2E8BA3
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r8
; X64-NEXT: addq %rcx, %rdx
@@ -2263,18 +2272,22 @@ define i128 @div_by_25(i128 %x) {
; X64-LABEL: div_by_25:
; X64: # %bb.0: # %entry
; X64-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF
+; X64-NEXT: movq %rdi, %rcx
+; X64-NEXT: andq %rax, %rcx
; X64-NEXT: movq %rdi, %rdx
+; X64-NEXT: shrdq $60, %rsi, %rdx
; X64-NEXT: andq %rax, %rdx
-; X64-NEXT: movq %rdi, %r8
-; X64-NEXT: shrdq $60, %rsi, %r8
-; X64-NEXT: addq %rdi, %r8
-; X64-NEXT: movq %r8, %r9
-; X64-NEXT: andq %rax, %r9
+; X64-NEXT: addq %rcx, %rdx
; X64-NEXT: movq %rsi, %rcx
; X64-NEXT: shrq $56, %rcx
-; X64-NEXT: cmpq %rdx, %r9
-; X64-NEXT: adcq %r8, %rcx
-; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: addq %rdx, %rcx
+; X64-NEXT: movabsq $2951479051793528259, %rdx # imm = 0x28F5C28F5C28F5C3
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %rdx
+; X64-NEXT: shrq $2, %rdx
+; X64-NEXT: leaq (%rdx,%rdx,4), %rax
+; X64-NEXT: leaq (%rax,%rax,4), %rax
+; X64-NEXT: subq %rax, %rcx
; X64-NEXT: movabsq $737869762948382065, %rdx # imm = 0xA3D70A3D70A3D71
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdx
@@ -2930,17 +2943,14 @@ define i128 @rem_by_7(i128 %x) {
;
; X64-LABEL: rem_by_7:
; X64: # %bb.0:
-; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT: movq %rdi, %rcx
-; X64-NEXT: shrdq $63, %rsi, %rcx
-; X64-NEXT: addq %rdi, %rcx
-; X64-NEXT: andq %rax, %rdi
-; X64-NEXT: movq %rcx, %rdx
-; X64-NEXT: andq %rax, %rdx
-; X64-NEXT: shrq $62, %rsi
-; X64-NEXT: cmpq %rdi, %rdx
-; X64-NEXT: adcq %rsi, %rcx
-; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: movabsq $1152921504606846975, %rcx # imm = 0xFFFFFFFFFFFFFFF
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: andq %rcx, %rax
+; X64-NEXT: shrdq $60, %rsi, %rdi
+; X64-NEXT: andq %rdi, %rcx
+; X64-NEXT: addq %rax, %rcx
+; X64-NEXT: shrq $56, %rsi
+; X64-NEXT: addq %rsi, %rcx
; X64-NEXT: movabsq $5270498306774157605, %rdx # imm = 0x4924924924924925
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdx
@@ -2948,6 +2958,12 @@ define i128 @rem_by_7(i128 %x) {
; X64-NEXT: leaq (,%rdx,8), %rax
; X64-NEXT: subq %rax, %rdx
; X64-NEXT: addq %rdx, %rcx
+; X64-NEXT: movabsq $2635249153387078803, %rdx # imm = 0x2492492492492493
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %rdx
+; X64-NEXT: leal (,%rdx,8), %eax
+; X64-NEXT: subq %rax, %rdx
+; X64-NEXT: addq %rdx, %rcx
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index 64b5f6f3db2e9..e99013a09182b 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -1217,36 +1217,43 @@ define <2 x i128> @v2i128_div_by_7(<2 x i128> %x) {
; SSE-NEXT: .cfi_def_cfa_offset 16
; SSE-NEXT: pushq %r14
; SSE-NEXT: .cfi_def_cfa_offset 24
-; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %r13
; SSE-NEXT: .cfi_def_cfa_offset 32
-; SSE-NEXT: pushq %rbx
+; SSE-NEXT: pushq %r12
; SSE-NEXT: .cfi_def_cfa_offset 40
-; SSE-NEXT: .cfi_offset %rbx, -40
-; SSE-NEXT: .cfi_offset %r12, -32
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: .cfi_def_cfa_offset 48
+; SSE-NEXT: .cfi_offset %rbx, -48
+; SSE-NEXT: .cfi_offset %r12, -40
+; SSE-NEXT: .cfi_offset %r13, -32
; SSE-NEXT: .cfi_offset %r14, -24
; SSE-NEXT: .cfi_offset %r15, -16
; SSE-NEXT: movq %rcx, %r9
; SSE-NEXT: movq %rdx, %rcx
-; SSE-NEXT: movabsq $9223372036854775807, %r11 # imm = 0x7FFFFFFFFFFFFFFF
+; SSE-NEXT: movabsq $1152921504606846975, %r15 # imm = 0xFFFFFFFFFFFFFFF
; SSE-NEXT: movq %rsi, %rax
-; SSE-NEXT: andq %r11, %rax
+; SSE-NEXT: andq %r15, %rax
; SSE-NEXT: movq %rsi, %rdx
-; SSE-NEXT: shrdq $63, %rcx, %rdx
-; SSE-NEXT: addq %rsi, %rdx
-; SSE-NEXT: movq %rdx, %rbx
-; SSE-NEXT: andq %r11, %rbx
+; SSE-NEXT: shrdq $60, %rcx, %rdx
+; SSE-NEXT: andq %r15, %rdx
+; SSE-NEXT: addq %rax, %rdx
; SSE-NEXT: movq %rcx, %r10
-; SSE-NEXT: shrq $62, %r10
-; SSE-NEXT: cmpq %rax, %rbx
-; SSE-NEXT: adcq %rdx, %r10
-; SSE-NEXT: andq %r11, %r10
-; SSE-NEXT: movabsq $5270498306774157605, %r15 # imm = 0x4924924924924925
+; SSE-NEXT: shrq $56, %r10
+; SSE-NEXT: addq %rdx, %r10
+; SSE-NEXT: movabsq $5270498306774157605, %r12 # imm = 0x4924924924924925
; SSE-NEXT: movq %r10, %rax
-; SSE-NEXT: mulq %r15
-; SSE-NEXT: shrq %rdx
-; SSE-NEXT: leaq (,%rdx,8), %rax
+; SSE-NEXT: mulq %r12
+; SSE-NEXT: movq %rdx, %r11
+; SSE-NEXT: shrq %r11
+; SSE-NEXT: leaq (,%r11,8), %rax
+; SSE-NEXT: subq %rax, %r11
+; SSE-NEXT: addq %r10, %r11
+; SSE-NEXT: movabsq $2635249153387078803, %r13 # imm = 0x2492492492492493
+; SSE-NEXT: movq %r11, %rax
+; SSE-NEXT: mulq %r13
+; SSE-NEXT: leal (,%rdx,8), %eax
; SSE-NEXT: subq %rax, %rdx
-; SSE-NEXT: addq %r10, %rdx
+; SSE-NEXT: addq %r11, %rdx
; SSE-NEXT: subq %rdx, %rsi
; SSE-NEXT: sbbq $0, %rcx
; SSE-NEXT: movabsq $-5270498306774157605, %rbx # imm = 0xB6DB6DB6DB6DB6DB
@@ -1260,23 +1267,26 @@ define <2 x i128> @v2i128_div_by_7(<2 x i128> %x) {
; SSE-NEXT: imulq %r14, %rcx
; SSE-NEXT: addq %rdx, %rcx
; SSE-NEXT: movq %r9, %rax
-; SSE-NEXT: andq %r11, %rax
+; SSE-NEXT: andq %r15, %rax
; SSE-NEXT: movq %r9, %rdx
-; SSE-NEXT: shrdq $63, %r8, %rdx
-; SSE-NEXT: addq %r9, %rdx
-; SSE-NEXT: movq %rdx, %r12
-; SSE-NEXT: andq %r11, %r12
+; SSE-NEXT: shrdq $60, %r8, %rdx
+; SSE-NEXT: andq %r15, %rdx
+; SSE-NEXT: addq %rax, %rdx
; SSE-NEXT: movq %r8, %r10
-; SSE-NEXT: shrq $62, %r10
-; SSE-NEXT: cmpq %rax, %r12
-; SSE-NEXT: adcq %rdx, %r10
-; SSE-NEXT: andq %r11, %r10
+; SSE-NEXT: shrq $56, %r10
+; SSE-NEXT: addq %rdx, %r10
; SSE-NEXT: movq %r10, %rax
-; SSE-NEXT: mulq %r15
-; SSE-NEXT: shrq %rdx
-; SSE-NEXT: leaq (,%rdx,8), %rax
+; SSE-NEXT: mulq %r12
+; SSE-NEXT: movq %rdx, %r11
+; SSE-NEXT: shrq %r11
+; SSE-NEXT: leaq (,%r11,8), %rax
+; SSE-NEXT: subq %rax, %r11
+; SSE-NEXT: addq %r10, %r11
+; SSE-NEXT: movq %r11, %rax
+; SSE-NEXT: mulq %r13
+; SSE-NEXT: leal (,%rdx,8), %eax
; SSE-NEXT: subq %rax, %rdx
-; SSE-NEXT: addq %r10, %rdx
+; SSE-NEXT: addq %r11, %rdx
; SSE-NEXT: subq %rdx, %r9
; SSE-NEXT: sbbq $0, %r8
; SSE-NEXT: imulq %r9, %rbx
@@ -1291,8 +1301,10 @@ define <2 x i128> @v2i128_div_by_7(<2 x i128> %x) {
; SSE-NEXT: movq %rcx, 8(%rdi)
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: popq %rbx
-; SSE-NEXT: .cfi_def_cfa_offset 32
+; SSE-NEXT: .cfi_def_cfa_offset 40
; SSE-NEXT: popq %r12
+; SSE-NEXT: .cfi_def_cfa_offset 32
+; SSE-NEXT: popq %r13
; SSE-NEXT: .cfi_def_cfa_offset 24
; SSE-NEXT: popq %r14
; SSE-NEXT: .cfi_def_cfa_offset 16
@@ -1306,36 +1318,43 @@ define <2 x i128> @v2i128_div_by_7(<2 x i128> %x) {
; AVX-NEXT: .cfi_def_cfa_offset 16
; AVX-NEXT: pushq %r14
; AVX-NEXT: .cfi_def_cfa_offset 24
-; AVX-NEXT: pushq %r12
+; AVX-NEXT: pushq %r13
; AVX-NEXT: .cfi_def_cfa_offset 32
-; AVX-NEXT: pushq %rbx
+; AVX-NEXT: pushq %r12
; AVX-NEXT: .cfi_def_cfa_offset 40
-; AVX-NEXT: .cfi_offset %rbx, -40
-; AVX-NEXT: .cfi_offset %r12, -32
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: .cfi_def_cfa_offset 48
+; AVX-NEXT: .cfi_offset %rbx, -48
+; AVX-NEXT: .cfi_offset %r12, -40
+; AVX-NEXT: .cfi_offset %r13, -32
; AVX-NEXT: .cfi_offset %r14, -24
; AVX-NEXT: .cfi_offset %r15, -16
; AVX-NEXT: movq %rcx, %r9
; AVX-NEXT: movq %rdx, %rcx
-; AVX-NEXT: movabsq $9223372036854775807, %r11 # imm = 0x7FFFFFFFFFFFFFFF
+; AVX-NEXT: movabsq $1152921504606846975, %r15 # imm = 0xFFFFFFFFFFFFFFF
; AVX-NEXT: movq %rsi, %rax
-; AVX-NEXT: andq %r11, %rax
+; AVX-NEXT: andq %r15, %rax
; AVX-NEXT: movq %rsi, %rdx
-; AVX-NEXT: shrdq $63, %rcx, %rdx
-; AVX-NEXT: addq %rsi, %rdx
-; AVX-NEXT: movq %rdx, %rbx
-; AVX-NEXT: andq %r11, %rbx
+; AVX-NEXT: shrdq $60, %rcx, %rdx
+; AVX-NEXT: andq %r15, %rdx
+; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: movq %rcx, %r10
-; AVX-NEXT: shrq $62, %r10
-; AVX-NEXT: cmpq %rax, %rbx
-; AVX-NEXT: adcq %rdx, %r10
-; AVX-NEXT: andq %r11, %r10
-; AVX-NEXT: movabsq $5270498306774157605, %r15 # imm = 0x4924924924924925
+; AVX-NEXT: shrq $56, %r10
+; AVX-NEXT: addq %rdx, %r10
+; AVX-NEXT: movabsq $5270498306774157605, %r12 # imm = 0x4924924924924925
; AVX-NEXT: movq %r10, %rax
-; AVX-NEXT: mulq %r15
-; AVX-NEXT: shrq %rdx
-; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: mulq %r12
+; AVX-NEXT: movq %rdx, %r11
+; AVX-NEXT: shrq %r11
+; AVX-NEXT: leaq (,%r11,8), %rax
+; AVX-NEXT: subq %rax, %r11
+; AVX-NEXT: addq %r10, %r11
+; AVX-NEXT: movabsq $2635249153387078803, %r13 # imm = 0x2492492492492493
+; AVX-NEXT: movq %r11, %rax
+; AVX-NEXT: mulq %r13
+; AVX-NEXT: leal (,%rdx,8), %eax
; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: addq %r10, %rdx
+; AVX-NEXT: addq %r11, %rdx
; AVX-NEXT: subq %rdx, %rsi
; AVX-NEXT: sbbq $0, %rcx
; AVX-NEXT: movabsq $-5270498306774157605, %rbx # imm = 0xB6DB6DB6DB6DB6DB
@@ -1349,23 +1368,26 @@ define <2 x i128> @v2i128_div_by_7(<2 x i128> %x) {
; AVX-NEXT: imulq %r14, %rcx
; AVX-NEXT: addq %rdx, %rcx
; AVX-NEXT: movq %r9, %rax
-; AVX-NEXT: andq %r11, %rax
+; AVX-NEXT: andq %r15, %rax
; AVX-NEXT: movq %r9, %rdx
-; AVX-NEXT: shrdq $63, %r8, %rdx
-; AVX-NEXT: addq %r9, %rdx
-; AVX-NEXT: movq %rdx, %r12
-; AVX-NEXT: andq %r11, %r12
+; AVX-NEXT: shrdq $60, %r8, %rdx
+; AVX-NEXT: andq %r15, %rdx
+; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: movq %r8, %r10
-; AVX-NEXT: shrq $62, %r10
-; AVX-NEXT: cmpq %rax, %r12
-; AVX-NEXT: adcq %rdx, %r10
-; AVX-NEXT: andq %r11, %r10
+; AVX-NEXT: shrq $56, %r10
+; AVX-NEXT: addq %rdx, %r10
; AVX-NEXT: movq %r10, %rax
-; AVX-NEXT: mulq %r15
-; AVX-NEXT: shrq %rdx
-; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: mulq %r12
+; AVX-NEXT: movq %rdx, %r11
+; AVX-NEXT: shrq %r11
+; AVX-NEXT: leaq (,%r11,8), %rax
+; AVX-NEXT: subq %rax, %r11
+; AVX-NEXT: addq %r10, %r11
+; AVX-NEXT: movq %r11, %rax
+; AVX-NEXT: mulq %r13
+; AVX-NEXT: leal (,%rdx,8), %eax
; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: addq %r10, %rdx
+; AVX-NEXT: addq %r11, %rdx
; AVX-NEXT: subq %rdx, %r9
; AVX-NEXT: sbbq $0, %r8
; AVX-NEXT: imulq %r9, %rbx
@@ -1380,8 +1402,10 @@ define <2 x i128> @v2i128_div_by_7(<2 x i128> %x) {
; AVX-NEXT: movq %rcx, 8(%rdi)
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: popq %rbx
-; AVX-NEXT: .cfi_def_cfa_offset 32
+; AVX-NEXT: .cfi_def_cfa_offset 40
; AVX-NEXT: popq %r12
+; AVX-NEXT: .cfi_def_cfa_offset 32
+; AVX-NEXT: popq %r13
; AVX-NEXT: .cfi_def_cfa_offset 24
; AVX-NEXT: popq %r14
; AVX-NEXT: .cfi_def_cfa_offset 16
@@ -1501,92 +1525,116 @@ entry:
define <2 x i128> @v2i128_rem_by_7(<2 x i128> %x) {
; SSE-LABEL: v2i128_rem_by_7:
; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: .cfi_offset %rbx, -16
; SSE-NEXT: movq %rdx, %r9
-; SSE-NEXT: movabsq $9223372036854775807, %r10 # imm = 0x7FFFFFFFFFFFFFFF
+; SSE-NEXT: movabsq $1152921504606846975, %r11 # imm = 0xFFFFFFFFFFFFFFF
; SSE-NEXT: movq %rsi, %rax
-; SSE-NEXT: shrdq $63, %rdx, %rax
-; SSE-NEXT: addq %rsi, %rax
-; SSE-NEXT: andq %r10, %rsi
-; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: andq %r10, %rdx
-; SSE-NEXT: shrq $62, %r9
-; SSE-NEXT: cmpq %rsi, %rdx
-; SSE-NEXT: adcq %rax, %r9
-; SSE-NEXT: andq %r10, %r9
-; SSE-NEXT: movabsq $5270498306774157605, %r11 # imm = 0x4924924924924925
+; SSE-NEXT: andq %r11, %rax
+; SSE-NEXT: shrdq $60, %rdx, %rsi
+; SSE-NEXT: andq %r11, %rsi
+; SSE-NEXT: addq %rax, %rsi
+; SSE-NEXT: shrq $56, %r9
+; SSE-NEXT: addq %rsi, %r9
+; SSE-NEXT: movabsq $5270498306774157605, %rbx # imm = 0x4924924924924925
; SSE-NEXT: movq %r9, %rax
-; SSE-NEXT: mulq %r11
+; SSE-NEXT: mulq %rbx
+; SSE-NEXT: movq %rdx, %r10
+; SSE-NEXT: shrq %r10
+; SSE-NEXT: leaq (,%r10,8), %rax
+; SSE-NEXT: subq %rax, %r10
+; SSE-NEXT: addq %r9, %r10
+; SSE-NEXT: movabsq $2635249153387078803, %r9 # imm = 0x2492492492492493
+; SSE-NEXT: movq %r10, %rax
+; SSE-NEXT: mulq %r9
; SSE-NEXT: movq %rdx, %rsi
-; SSE-NEXT: shrq %rsi
-; SSE-NEXT: leaq (,%rsi,8), %rax
+; SSE-NEXT: leal (,%rdx,8), %eax
; SSE-NEXT: subq %rax, %rsi
-; SSE-NEXT: addq %r9, %rsi
+; SSE-NEXT: addq %r10, %rsi
; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: shrdq $63, %r8, %rax
-; SSE-NEXT: addq %rcx, %rax
-; SSE-NEXT: andq %r10, %rcx
-; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: andq %r10, %rdx
-; SSE-NEXT: shrq $62, %r8
-; SSE-NEXT: cmpq %rcx, %rdx
-; SSE-NEXT: adcq %rax, %r8
-; SSE-NEXT: andq %r10, %r8
+; SSE-NEXT: andq %r11, %rax
+; SSE-NEXT: shrdq $60, %r8, %rcx
+; SSE-NEXT: andq %r11, %rcx
+; SSE-NEXT: addq %rax, %rcx
+; SSE-NEXT: shrq $56, %r8
+; SSE-NEXT: addq %rcx, %r8
; SSE-NEXT: movq %r8, %rax
-; SSE-NEXT: mulq %r11
-; SSE-NEXT: shrq %rdx
-; SSE-NEXT: leaq (,%rdx,8), %rax
+; SSE-NEXT: mulq %rbx
+; SSE-NEXT: movq %rdx, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: leaq (,%rcx,8), %rax
+; SSE-NEXT: subq %rax, %rcx
+; SSE-NEXT: addq %r8, %rcx
+; SSE-NEXT: movq %rcx, %rax
+; SSE-NEXT: mulq %r9
+; SSE-NEXT: leal (,%rdx,8), %eax
; SSE-NEXT: subq %rax, %rdx
-; SSE-NEXT: addq %r8, %rdx
+; SSE-NEXT: addq %rcx, %rdx
; SSE-NEXT: movq %rdx, 16(%rdi)
; SSE-NEXT: movq %rsi, (%rdi)
; SSE-NEXT: movq $0, 24(%rdi)
; SSE-NEXT: movq $0, 8(%rdi)
; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: .cfi_def_cfa_offset 8
; SSE-NEXT: retq
;
; AVX-LABEL: v2i128_rem_by_7:
; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: .cfi_offset %rbx, -16
; AVX-NEXT: movq %rdx, %r9
-; AVX-NEXT: movabsq $9223372036854775807, %r10 # imm = 0x7FFFFFFFFFFFFFFF
+; AVX-NEXT: movabsq $1152921504606846975, %r11 # imm = 0xFFFFFFFFFFFFFFF
; AVX-NEXT: movq %rsi, %rax
-; AVX-NEXT: shrdq $63, %rdx, %rax
-; AVX-NEXT: addq %rsi, %rax
-; AVX-NEXT: andq %r10, %rsi
-; AVX-NEXT: movq %rax, %rdx
-; AVX-NEXT: andq %r10, %rdx
-; AVX-NEXT: shrq $62, %r9
-; AVX-NEXT: cmpq %rsi, %rdx
-; AVX-NEXT: adcq %rax, %r9
-; AVX-NEXT: andq %r10, %r9
-; AVX-NEXT: movabsq $5270498306774157605, %r11 # imm = 0x4924924924924925
+; AVX-NEXT: andq %r11, %rax
+; AVX-NEXT: shrdq $60, %rdx, %rsi
+; AVX-NEXT: andq %r11, %rsi
+; AVX-NEXT: addq %rax, %rsi
+; AVX-NEXT: shrq $56, %r9
+; AVX-NEXT: addq %rsi, %r9
+; AVX-NEXT: movabsq $5270498306774157605, %rbx # imm = 0x4924924924924925
; AVX-NEXT: movq %r9, %rax
-; AVX-NEXT: mulq %r11
+; AVX-NEXT: mulq %rbx
+; AVX-NEXT: movq %rdx, %r10
+; AVX-NEXT: shrq %r10
+; AVX-NEXT: leaq (,%r10,8), %rax
+; AVX-NEXT: subq %rax, %r10
+; AVX-NEXT: addq %r9, %r10
+; AVX-NEXT: movabsq $2635249153387078803, %r9 # imm = 0x2492492492492493
+; AVX-NEXT: movq %r10, %rax
+; AVX-NEXT: mulq %r9
; AVX-NEXT: movq %rdx, %rsi
-; AVX-NEXT: shrq %rsi
-; AVX-NEXT: leaq (,%rsi,8), %rax
+; AVX-NEXT: leal (,%rdx,8), %eax
; AVX-NEXT: subq %rax, %rsi
-; AVX-NEXT: addq %r9, %rsi
+; AVX-NEXT: addq %r10, %rsi
; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: shrdq $63, %r8, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: andq %r10, %rcx
-; AVX-NEXT: movq %rax, %rdx
-; AVX-NEXT: andq %r10, %rdx
-; AVX-NEXT: shrq $62, %r8
-; AVX-NEXT: cmpq %rcx, %rdx
-; AVX-NEXT: adcq %rax, %r8
-; AVX-NEXT: andq %r10, %r8
+; AVX-NEXT: andq %r11, %rax
+; AVX-NEXT: shrdq $60, %r8, %rcx
+; AVX-NEXT: andq %r11, %rcx
+; AVX-NEXT: addq %rax, %rcx
+; AVX-NEXT: shrq $56, %r8
+; AVX-NEXT: addq %rcx, %r8
; AVX-NEXT: movq %r8, %rax
-; AVX-NEXT: mulq %r11
-; AVX-NEXT: shrq %rdx
-; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: mulq %rbx
+; AVX-NEXT: movq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: leaq (,%rcx,8), %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: addq %r8, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %r9
+; AVX-NEXT: leal (,%rdx,8), %eax
; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: addq %r8, %rdx
+; AVX-NEXT: addq %rcx, %rdx
; AVX-NEXT: movq %rdx, 16(%rdi)
; AVX-NEXT: movq %rsi, (%rdi)
; AVX-NEXT: movq $0, 24(%rdi)
; AVX-NEXT: movq $0, 8(%rdi)
; AVX-NEXT: movq %rdi, %rax
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%rem = urem <2 x i128> %x, <i128 7, i128 7>
>From ddf7f09d6a25394483c42c43b66977851e2b51d3 Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Sat, 14 Mar 2026 17:41:13 +0530
Subject: [PATCH 12/15] Minor comment formatting
---
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1c4851bb3c2cf..dcaeffdc50148 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8238,7 +8238,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
unsigned BitWidth = VT.getScalarSizeInBits();
unsigned BestChunkWidth = 0;
- // Determine the legal scalar integer type for chunk operations (e.g., i64).
+ // Determine the legal scalar integer type for chunk operations.
EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
unsigned LegalWidth = LegalVT.getScalarSizeInBits();
unsigned MaxChunk = std::min<unsigned>(LegalWidth, BitWidth);
@@ -8259,9 +8259,10 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
unsigned NumChunks = divideCeil(BitWidth, i);
// if the ChunkWidth (i) plus the Potential Carry Bits is less than the
// Register Width (64), we have enough "slack" at the top of the
- // register to let the carries pile up safely. Max sum is NumChunks *
- // (2^i - 1) so by approximation we need NumChunks × 2^i < 2^L. Taking
- // log on both size we have log2(NumChunks) + i < L.
+ // register to let the carries pile up safely.
+ // Max sum is NumChunks * (2^i - 1) so by approximation we need
+ // NumChunks * 2^i < 2^L. Taking the log of both sides we have
+ // log2(NumChunks) + i < L.
if (i + Log2_32_Ceil(NumChunks) < LegalWidth) {
BestChunkWidth = i;
break;
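
To make the slack argument in the comment above concrete, here is a small
standalone C++ sketch of the same search (illustrative only, not the LLVM
implementation: it hardcodes BitWidth = 128 and LegalWidth = 64, uses
uint64_t in place of APInt, and bestChunkWidth is a made-up name):

    #include <cstdint>
    #include <cstdio>

    // Find the widest chunk width W <= 63 such that 2^W % D == 1 and the
    // chunk sum still fits in 64 bits once the carries are accounted for.
    static unsigned bestChunkWidth(uint64_t D) {
      for (unsigned W = 63; W >= 1; --W) {
        if ((1ULL << W) % D != 1)
          continue;
        unsigned NumChunks = (128 + W - 1) / W; // ceil(128 / W)
        unsigned CarryBits = 0;                 // ceil(log2(NumChunks))
        while ((1ULL << CarryBits) < NumChunks)
          ++CarryBits;
        if (W + CarryBits < 64)                 // enough slack for the carries
          return W;
      }
      return 0; // no usable width; the caller falls back to the libcall
    }

    int main() {
      printf("D=7  -> W=%u\n", bestChunkWidth(7));  // 60 (2^60 % 7 == 1)
      printf("D=9  -> W=%u\n", bestChunkWidth(9));  // 60 (2^60 % 9 == 1)
      printf("D=11 -> W=%u\n", bestChunkWidth(11)); // 60 (2^60 % 11 == 1)
      return 0;
    }

For D = 7, W = 63 also satisfies 2^63 % 7 == 1 but fails the slack check
(63 + 2 = 65, not below 64), so the search settles on W = 60 with 3 chunks
and 2 carry bits, matching the 0xFFFFFFFFFFFFFFF masks and the shrdq $60 /
shrq $56 sequences in the X86 tests above.
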
>From fb9511d7811141ce90cd56b22ab7d49affe40845 Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Sat, 14 Mar 2026 17:50:42 +0530
Subject: [PATCH 13/15] Address review comments on style
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 22 +++++++++----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index dcaeffdc50148..72df8cf5481c5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8245,26 +8245,26 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
// Precompute 2^MaxChunk mod Divisor
APInt Mod(Divisor.getBitWidth(), 1);
- for (unsigned k = 0; k < MaxChunk; ++k)
+ for (unsigned K = 0; K != MaxChunk; ++K)
Mod = Mod.shl(1).urem(Divisor);
// Since Divisor is odd, modular inverse of 2 is (Divisor + 1) / 2
APInt Inv2 = (Divisor + 1).lshr(1);
// Search for W where 2^W % Divisor == 1
- for (unsigned i = MaxChunk; i > MaxChunk / 2; --i) {
+ for (unsigned I = MaxChunk, E = MaxChunk / 2; I > E; --I) {
if (Mod.isOne()) {
// Safety Check: Ensure (NumChunks * MaxChunkValue) doesn't overflow
// LegalVT
- unsigned NumChunks = divideCeil(BitWidth, i);
- // if the ChunkWidth (i) plus the Potential Carry Bits is less than the
+ unsigned NumChunks = divideCeil(BitWidth, I);
+ // if the ChunkWidth (I) plus the Potential Carry Bits is less than the
// Register Width (64), we have enough "slack" at the top of the
// register to let the carries pile up safely.
- // Max sum is NumChunks * (2^i - 1) so by approximation we need
- // NumChunks * 2^i < 2^L. Taking the log of both sides we have
- // log2(NumChunks) + i < L.
- if (i + Log2_32_Ceil(NumChunks) < LegalWidth) {
- BestChunkWidth = i;
+ // Max sum is NumChunks * (2^I - 1) so by approximation we need
+ // NumChunks * 2^I < 2^L. Taking the log of both sides we have
+ // log2(NumChunks) + I < L.
+ if (I + Log2_32_Ceil(NumChunks) < LegalWidth) {
+ BestChunkWidth = I;
break;
}
}
@@ -8280,8 +8280,8 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
APInt MaskVal = APInt::getLowBitsSet(LegalWidth, BestChunkWidth);
SDValue Mask = DAG.getConstant(MaskVal, dl, LegalVT);
- for (unsigned i = 0; i < BitWidth; i += BestChunkWidth) {
- SDValue Shift = DAG.getShiftAmountConstant(i, VT, dl);
+ for (unsigned I = 0; I < BitWidth; I += BestChunkWidth) {
+ SDValue Shift = DAG.getShiftAmountConstant(I, VT, dl);
SDValue Chunk = DAG.getNode(ISD::SRL, dl, VT, In, Shift);
// Truncate to LegalVT
SDValue TruncChunk = DAG.getNode(ISD::TRUNCATE, dl, LegalVT, Chunk);
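
A worked example of the inverse-of-2 stepping set up above (numbers chosen
for illustration; the multiply-by-Inv2 update itself sits outside the quoted
hunk): for Divisor = 7 the inverse is (7 + 1) / 2 = 4, and indeed
2 * 4 = 8 == 1 (mod 7). Walking W downward therefore costs one modular
multiply per step: 2^63 == 1, 2^62 == 1 * 4 == 4, 2^61 == 4 * 4 == 16 == 2,
and 2^60 == 2 * 4 == 8 == 1 (mod 7). Mod.isOne() thus fires at W = 63 and
W = 60, and the overflow check above decides which of the two is usable.
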
>From 99d1b8edea871ee31a88a80601b4dbbb396657cf Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Mon, 16 Mar 2026 11:08:39 +0530
Subject: [PATCH 14/15] Remove redundant urem
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 13 +-
llvm/test/CodeGen/AArch64/rem-by-const.ll | 19 +-
llvm/test/CodeGen/RISCV/div-by-constant.ll | 38 ++-
.../CodeGen/RISCV/split-udiv-by-constant.ll | 106 ++++----
.../CodeGen/RISCV/split-urem-by-constant.ll | 22 +-
llvm/test/CodeGen/RISCV/urem-vector-lkk.ll | 3 -
llvm/test/CodeGen/X86/divide-by-constant.ll | 26 +-
llvm/test/CodeGen/X86/divmod128.ll | 14 --
llvm/test/CodeGen/X86/i128-udiv.ll | 40 +--
llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll | 238 ++++++------------
10 files changed, 156 insertions(+), 363 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 72df8cf5481c5..91e1f9bd9201a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8263,7 +8263,8 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
// Max sum is NumChunks * (2^I - 1) so by approximation we need
// NumChunks × 2^I < 2^L. Taking log on both size we will have
// log2(NumChunks) + I < L.
- if (I + Log2_32_Ceil(NumChunks) < LegalWidth) {
+ // if (I + llvm::bit_width(NumChunks - 1) <= LegalWidth) {
+ if (I + Log2_32_Ceil(NumChunks) < LegalWidth) {
BestChunkWidth = I;
break;
}
@@ -8291,15 +8292,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
DAG.getNode(ISD::AND, dl, LegalVT, TruncChunk, Mask);
TotalSum = DAG.getNode(ISD::ADD, dl, LegalVT, TotalSum, MaskedChunk);
}
-
- // Final reduction: TotalSum % Divisor.
- // Since TotalSum is in LegalVT, this UREM will be lowered via magic
- // multiplication.
- SDValue ResRem =
- DAG.getNode(ISD::UREM, dl, LegalVT, TotalSum,
- DAG.getConstant(Divisor.trunc(LegalWidth), dl, LegalVT));
-
- Sum = DAG.getNode(ISD::ZERO_EXTEND, dl, HiLoVT, ResRem);
+ Sum = DAG.getNode(ISD::ZERO_EXTEND, dl, HiLoVT, TotalSum);
}
// If we didn't find a sum, we can't do the expansion.
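
The dropped UREM was redundant rather than wrong: by construction of the
chunk width, 2^BestChunkWidth == 1 (mod Divisor), so for example with 60-bit
chunks and Divisor = 7 we get x = c0 + c1*2^60 + c2*2^120 == c0 + c1 + c2
(mod 7), and the expansion already applies the smaller UDIV/UREM to the
returned Sum afterwards. Reducing TotalSum here as well merely stacked a
second magic-multiply sequence, which is exactly what the test updates below
delete.
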
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index 23655839ae164..ae82a73d60efa 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -513,13 +513,6 @@ define i128 @ui128_7(i128 %a, i128 %b) {
; CHECK-SD-NEXT: umulh x9, x8, x9
; CHECK-SD-NEXT: lsr x9, x9, #1
; CHECK-SD-NEXT: sub x9, x9, x9, lsl #3
-; CHECK-SD-NEXT: add x8, x8, x9
-; CHECK-SD-NEXT: mov x9, #9363 // =0x2493
-; CHECK-SD-NEXT: movk x9, #37449, lsl #16
-; CHECK-SD-NEXT: movk x9, #18724, lsl #32
-; CHECK-SD-NEXT: movk x9, #9362, lsl #48
-; CHECK-SD-NEXT: umulh x9, x8, x9
-; CHECK-SD-NEXT: sub x9, x9, x9, lsl #3
; CHECK-SD-NEXT: add x0, x8, x9
; CHECK-SD-NEXT: ret
;
@@ -3098,7 +3091,6 @@ define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) {
; CHECK-SD-NEXT: extr x8, x3, x2, #60
; CHECK-SD-NEXT: and x10, x0, #0xfffffffffffffff
; CHECK-SD-NEXT: mov x11, #18725 // =0x4925
-; CHECK-SD-NEXT: mov x12, #9363 // =0x2493
; CHECK-SD-NEXT: and x9, x9, #0xfffffffffffffff
; CHECK-SD-NEXT: and x8, x8, #0xfffffffffffffff
; CHECK-SD-NEXT: movk x11, #9362, lsl #16
@@ -3109,23 +3101,14 @@ define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) {
; CHECK-SD-NEXT: add x9, x9, x1, lsr #56
; CHECK-SD-NEXT: movk x11, #18724, lsl #48
; CHECK-SD-NEXT: add x8, x8, x3, lsr #56
-; CHECK-SD-NEXT: movk x12, #37449, lsl #16
; CHECK-SD-NEXT: mov x1, xzr
-; CHECK-SD-NEXT: umulh x10, x9, x11
-; CHECK-SD-NEXT: movk x12, #18724, lsl #32
; CHECK-SD-NEXT: mov x3, xzr
-; CHECK-SD-NEXT: movk x12, #9362, lsl #48
+; CHECK-SD-NEXT: umulh x10, x9, x11
; CHECK-SD-NEXT: umulh x11, x8, x11
; CHECK-SD-NEXT: lsr x10, x10, #1
; CHECK-SD-NEXT: lsr x11, x11, #1
; CHECK-SD-NEXT: sub x10, x10, x10, lsl #3
; CHECK-SD-NEXT: sub x11, x11, x11, lsl #3
-; CHECK-SD-NEXT: add x9, x9, x10
-; CHECK-SD-NEXT: umulh x10, x9, x12
-; CHECK-SD-NEXT: add x8, x8, x11
-; CHECK-SD-NEXT: umulh x11, x8, x12
-; CHECK-SD-NEXT: sub x10, x10, x10, lsl #3
-; CHECK-SD-NEXT: sub x11, x11, x11, lsl #3
; CHECK-SD-NEXT: add x0, x9, x10
; CHECK-SD-NEXT: add x2, x8, x11
; CHECK-SD-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index 062b93843c80a..7293a549ac065 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -121,31 +121,27 @@ define i64 @udiv64_constant_add(i64 %a) nounwind {
; RV32IM-NEXT: or a3, a4, a3
; RV32IM-NEXT: lui a4, 449390
; RV32IM-NEXT: addi a2, a2, -1
-; RV32IM-NEXT: addi a6, a6, -1755
; RV32IM-NEXT: and a3, a3, a2
; RV32IM-NEXT: and a2, a0, a2
; RV32IM-NEXT: add a2, a2, a3
-; RV32IM-NEXT: add a2, a2, a5
-; RV32IM-NEXT: mulhu a3, a2, a6
-; RV32IM-NEXT: slli a5, a3, 3
-; RV32IM-NEXT: sub a3, a3, a5
-; RV32IM-NEXT: lui a5, 748983
+; RV32IM-NEXT: lui a3, 748983
+; RV32IM-NEXT: addi a6, a6, -1755
; RV32IM-NEXT: addi a4, a4, -1171
-; RV32IM-NEXT: addi a5, a5, -585
-; RV32IM-NEXT: add a2, a2, a3
-; RV32IM-NEXT: mulhu a3, a2, a6
-; RV32IM-NEXT: slli a6, a3, 3
-; RV32IM-NEXT: sub a3, a3, a6
-; RV32IM-NEXT: add a2, a2, a3
-; RV32IM-NEXT: sub a3, a0, a2
+; RV32IM-NEXT: addi a3, a3, -585
+; RV32IM-NEXT: add a2, a2, a5
+; RV32IM-NEXT: mulhu a5, a2, a6
+; RV32IM-NEXT: slli a6, a5, 3
+; RV32IM-NEXT: sub a5, a5, a6
+; RV32IM-NEXT: add a2, a2, a5
+; RV32IM-NEXT: sub a5, a0, a2
; RV32IM-NEXT: sltu a0, a0, a2
-; RV32IM-NEXT: mul a2, a3, a4
-; RV32IM-NEXT: mulhu a4, a3, a5
+; RV32IM-NEXT: mul a2, a5, a4
+; RV32IM-NEXT: mulhu a4, a5, a3
; RV32IM-NEXT: sub a1, a1, a0
; RV32IM-NEXT: add a2, a4, a2
-; RV32IM-NEXT: mul a1, a1, a5
+; RV32IM-NEXT: mul a1, a1, a3
; RV32IM-NEXT: add a1, a2, a1
-; RV32IM-NEXT: mul a0, a3, a5
+; RV32IM-NEXT: mul a0, a5, a3
; RV32IM-NEXT: ret
;
; RV32IMZB-LABEL: udiv64_constant_add:
@@ -160,16 +156,12 @@ define i64 @udiv64_constant_add(i64 %a) nounwind {
; RV32IMZB-NEXT: andn a2, a2, a4
; RV32IMZB-NEXT: lui a4, 449390
; RV32IMZB-NEXT: srli a5, a5, 5
-; RV32IMZB-NEXT: addi a3, a3, -1755
; RV32IMZB-NEXT: add a5, a5, a6
-; RV32IMZB-NEXT: add a2, a5, a2
-; RV32IMZB-NEXT: mulhu a5, a2, a3
-; RV32IMZB-NEXT: slli a6, a5, 3
-; RV32IMZB-NEXT: sub a5, a5, a6
; RV32IMZB-NEXT: lui a6, 748983
+; RV32IMZB-NEXT: addi a3, a3, -1755
; RV32IMZB-NEXT: addi a4, a4, -1171
; RV32IMZB-NEXT: addi a6, a6, -585
-; RV32IMZB-NEXT: add a2, a2, a5
+; RV32IMZB-NEXT: add a2, a5, a2
; RV32IMZB-NEXT: mulhu a3, a2, a3
; RV32IMZB-NEXT: slli a5, a3, 3
; RV32IMZB-NEXT: sub a3, a3, a5
diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
index b3f673415f923..6fbc81e61a973 100644
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -125,31 +125,27 @@ define iXLen2 @test_udiv_7(iXLen2 %x) nounwind {
; RV32-NEXT: or a3, a4, a3
; RV32-NEXT: lui a4, 449390
; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: addi a6, a6, -1755
; RV32-NEXT: and a3, a3, a2
; RV32-NEXT: and a2, a0, a2
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, a2, a5
-; RV32-NEXT: mulhu a3, a2, a6
-; RV32-NEXT: slli a5, a3, 3
-; RV32-NEXT: sub a3, a3, a5
-; RV32-NEXT: lui a5, 748983
+; RV32-NEXT: lui a3, 748983
+; RV32-NEXT: addi a6, a6, -1755
; RV32-NEXT: addi a4, a4, -1171
-; RV32-NEXT: addi a5, a5, -585
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: mulhu a3, a2, a6
-; RV32-NEXT: slli a6, a3, 3
-; RV32-NEXT: sub a3, a3, a6
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: sub a3, a0, a2
+; RV32-NEXT: addi a3, a3, -585
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: mulhu a5, a2, a6
+; RV32-NEXT: slli a6, a5, 3
+; RV32-NEXT: sub a5, a5, a6
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: sub a5, a0, a2
; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: mul a2, a3, a4
-; RV32-NEXT: mulhu a4, a3, a5
+; RV32-NEXT: mul a2, a5, a4
+; RV32-NEXT: mulhu a4, a5, a3
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: add a2, a4, a2
-; RV32-NEXT: mul a1, a1, a5
+; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, a2, a1
-; RV32-NEXT: mul a0, a3, a5
+; RV32-NEXT: mul a0, a5, a3
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_7:
@@ -176,22 +172,16 @@ define iXLen2 @test_udiv_7(iXLen2 %x) nounwind {
; RV64-NEXT: srli a5, a5, 1
; RV64-NEXT: slli a6, a5, 3
; RV64-NEXT: sub a5, a5, a6
-; RV64-NEXT: lui a6, %hi(.LCPI2_2)
-; RV64-NEXT: ld a6, %lo(.LCPI2_2)(a6)
; RV64-NEXT: add a2, a2, a5
-; RV64-NEXT: mulhu a4, a2, a4
-; RV64-NEXT: slli a5, a4, 3
-; RV64-NEXT: sub a4, a4, a5
-; RV64-NEXT: add a2, a2, a4
-; RV64-NEXT: sub a4, a0, a2
+; RV64-NEXT: sub a5, a0, a2
; RV64-NEXT: sltu a0, a0, a2
-; RV64-NEXT: mul a2, a4, a6
-; RV64-NEXT: mulhu a5, a4, a3
+; RV64-NEXT: mul a2, a5, a4
+; RV64-NEXT: mulhu a4, a5, a3
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: add a2, a5, a2
+; RV64-NEXT: add a2, a4, a2
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, a2, a1
-; RV64-NEXT: mul a0, a4, a3
+; RV64-NEXT: mul a0, a5, a3
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 7
ret iXLen2 %a
@@ -208,31 +198,27 @@ define iXLen2 @test_udiv_9(iXLen2 %x) nounwind {
; RV32-NEXT: or a3, a4, a3
; RV32-NEXT: lui a4, 582542
; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: addi a6, a6, 1821
; RV32-NEXT: and a3, a3, a2
; RV32-NEXT: and a2, a0, a2
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a2, a2, a5
-; RV32-NEXT: mulhu a3, a2, a6
-; RV32-NEXT: slli a5, a3, 3
-; RV32-NEXT: add a3, a5, a3
-; RV32-NEXT: lui a5, 233017
+; RV32-NEXT: lui a3, 233017
+; RV32-NEXT: addi a6, a6, 1821
; RV32-NEXT: addi a4, a4, 910
-; RV32-NEXT: addi a5, a5, -455
-; RV32-NEXT: sub a2, a2, a3
-; RV32-NEXT: mulhu a3, a2, a6
-; RV32-NEXT: slli a6, a3, 3
-; RV32-NEXT: or a3, a6, a3
-; RV32-NEXT: sub a2, a2, a3
-; RV32-NEXT: sub a3, a0, a2
+; RV32-NEXT: addi a3, a3, -455
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: mulhu a5, a2, a6
+; RV32-NEXT: slli a6, a5, 3
+; RV32-NEXT: add a5, a6, a5
+; RV32-NEXT: sub a2, a2, a5
+; RV32-NEXT: sub a5, a0, a2
; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: mul a2, a3, a4
-; RV32-NEXT: mulhu a4, a3, a5
+; RV32-NEXT: mul a2, a5, a4
+; RV32-NEXT: mulhu a4, a5, a3
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: add a2, a4, a2
-; RV32-NEXT: mul a1, a1, a5
+; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, a2, a1
-; RV32-NEXT: mul a0, a3, a5
+; RV32-NEXT: mul a0, a5, a3
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_9:
@@ -245,31 +231,27 @@ define iXLen2 @test_udiv_9(iXLen2 %x) nounwind {
; RV64-NEXT: or a3, a4, a3
; RV64-NEXT: lui a4, %hi(.LCPI3_1)
; RV64-NEXT: srli a2, a2, 4
-; RV64-NEXT: ld a6, %lo(.LCPI3_0)(a6)
; RV64-NEXT: and a3, a3, a2
; RV64-NEXT: and a2, a0, a2
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a2, a2, a5
-; RV64-NEXT: mulhu a3, a2, a6
-; RV64-NEXT: slli a5, a3, 3
-; RV64-NEXT: add a3, a5, a3
-; RV64-NEXT: lui a5, %hi(.LCPI3_2)
+; RV64-NEXT: lui a3, %hi(.LCPI3_2)
+; RV64-NEXT: ld a6, %lo(.LCPI3_0)(a6)
; RV64-NEXT: ld a4, %lo(.LCPI3_1)(a4)
-; RV64-NEXT: ld a5, %lo(.LCPI3_2)(a5)
-; RV64-NEXT: sub a2, a2, a3
-; RV64-NEXT: mulhu a3, a2, a6
-; RV64-NEXT: slli a6, a3, 3
-; RV64-NEXT: or a3, a6, a3
-; RV64-NEXT: sub a2, a2, a3
-; RV64-NEXT: sub a3, a0, a2
+; RV64-NEXT: ld a3, %lo(.LCPI3_2)(a3)
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: mulhu a5, a2, a6
+; RV64-NEXT: slli a6, a5, 3
+; RV64-NEXT: add a5, a6, a5
+; RV64-NEXT: sub a2, a2, a5
+; RV64-NEXT: sub a5, a0, a2
; RV64-NEXT: sltu a0, a0, a2
-; RV64-NEXT: mul a2, a3, a4
-; RV64-NEXT: mulhu a4, a3, a5
+; RV64-NEXT: mul a2, a5, a4
+; RV64-NEXT: mulhu a4, a5, a3
; RV64-NEXT: sub a1, a1, a0
; RV64-NEXT: add a2, a4, a2
-; RV64-NEXT: mul a1, a1, a5
+; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, a2, a1
-; RV64-NEXT: mul a0, a3, a5
+; RV64-NEXT: mul a0, a5, a3
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 9
ret iXLen2 %a
diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
index d2033d7caf602..bbd320a9381c9 100644
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -91,10 +91,6 @@ define iXLen2 @test_urem_7(iXLen2 %x) nounwind {
; RV32-NEXT: add a0, a0, a2
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: addi a1, a4, -1755
-; RV32-NEXT: mulhu a2, a0, a1
-; RV32-NEXT: slli a3, a2, 3
-; RV32-NEXT: sub a2, a2, a3
-; RV32-NEXT: add a0, a0, a2
; RV32-NEXT: mulhu a1, a0, a1
; RV32-NEXT: slli a2, a1, 3
; RV32-NEXT: sub a1, a1, a2
@@ -107,23 +103,17 @@ define iXLen2 @test_urem_7(iXLen2 %x) nounwind {
; RV64-NEXT: li a2, -1
; RV64-NEXT: slli a3, a1, 4
; RV64-NEXT: srli a4, a0, 60
-; RV64-NEXT: srli a1, a1, 56
; RV64-NEXT: or a3, a4, a3
; RV64-NEXT: lui a4, %hi(.LCPI2_0)
; RV64-NEXT: srli a2, a2, 4
; RV64-NEXT: ld a4, %lo(.LCPI2_0)(a4)
; RV64-NEXT: and a0, a0, a2
; RV64-NEXT: and a2, a3, a2
-; RV64-NEXT: lui a3, %hi(.LCPI2_1)
; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: srli a1, a1, 56
; RV64-NEXT: add a0, a0, a1
; RV64-NEXT: mulhu a1, a0, a4
; RV64-NEXT: srli a1, a1, 1
-; RV64-NEXT: ld a2, %lo(.LCPI2_1)(a3)
-; RV64-NEXT: slli a3, a1, 3
-; RV64-NEXT: sub a1, a1, a3
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: mulhu a1, a0, a2
; RV64-NEXT: slli a2, a1, 3
; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: add a0, a0, a1
@@ -148,13 +138,9 @@ define iXLen2 @test_urem_9(iXLen2 %x) nounwind {
; RV32-NEXT: add a0, a0, a2
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: addi a1, a4, 1821
-; RV32-NEXT: mulhu a2, a0, a1
-; RV32-NEXT: slli a3, a2, 3
-; RV32-NEXT: add a2, a3, a2
-; RV32-NEXT: sub a0, a0, a2
; RV32-NEXT: mulhu a1, a0, a1
; RV32-NEXT: slli a2, a1, 3
-; RV32-NEXT: or a1, a2, a1
+; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
@@ -177,10 +163,6 @@ define iXLen2 @test_urem_9(iXLen2 %x) nounwind {
; RV64-NEXT: slli a2, a1, 3
; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
-; RV64-NEXT: mulhu a1, a0, a4
-; RV64-NEXT: slli a2, a1, 3
-; RV64-NEXT: or a1, a2, a1
-; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
%a = urem iXLen2 %x, 9
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index 003df634699ae..180fa6fd2b2f6 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -886,9 +886,6 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV32IM-NEXT: mulhu a2, a1, a6
; RV32IM-NEXT: li a4, 23
; RV32IM-NEXT: mul a2, a2, a4
-; RV32IM-NEXT: sub a1, a1, a2
-; RV32IM-NEXT: mulhu a2, a1, a6
-; RV32IM-NEXT: mul a2, a2, a4
; RV32IM-NEXT: sub s7, a1, a2
; RV32IM-NEXT: li a2, 1
; RV32IM-NEXT: mv a1, a3
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index b825c2a5befbb..546e375a7f8c6 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -294,11 +294,9 @@ entry:
define i64 @PR23590(i64 %x) nounwind {
; X86-LABEL: PR23590:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl %eax
; X86-NEXT: pushl $0
; X86-NEXT: pushl $12345 # imm = 0x3039
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -311,19 +309,13 @@ define i64 @PR23590(i64 %x) nounwind {
; X86-NEXT: movl %esi, %edx
; X86-NEXT: shrdl $27, %ecx, %edx
; X86-NEXT: andl $134217727, %edx # imm = 0x7FFFFFF
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: shrl $22, %ebx
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: movl $613566757, %ebp # imm = 0x24924925
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: leal (,%edx,8), %eax
-; X86-NEXT: subl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: shrl $22, %edi
+; X86-NEXT: addl %eax, %edi
+; X86-NEXT: addl %edx, %edi
+; X86-NEXT: movl $613566757, %edx # imm = 0x24924925
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ebp
+; X86-NEXT: mull %edx
; X86-NEXT: leal (,%edx,8), %eax
; X86-NEXT: subl %eax, %edx
; X86-NEXT: addl %edi, %edx
@@ -336,11 +328,9 @@ define i64 @PR23590(i64 %x) nounwind {
; X86-NEXT: addl %esi, %edx
; X86-NEXT: imull $-1227133513, %ecx, %ecx # imm = 0xB6DB6DB7
; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: addl $12, %esp
+; X86-NEXT: addl $4, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-FAST-LABEL: PR23590:
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index 262d31b44be91..44f4b8227cac8 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -82,13 +82,6 @@ define i64 @umod128(i128 %x) nounwind {
; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax
; X86-64-NEXT: leaq (%rdx,%rax,2), %rax
; X86-64-NEXT: subq %rax, %rcx
-; X86-64-NEXT: movabsq $1676976733973595602, %rdx # imm = 0x1745D1745D1745D2
-; X86-64-NEXT: movq %rcx, %rax
-; X86-64-NEXT: mulq %rdx
-; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax
-; X86-64-NEXT: movl %eax, %eax
-; X86-64-NEXT: leaq (%rdx,%rax,2), %rax
-; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: movq %rcx, %rax
; X86-64-NEXT: retq
;
@@ -109,13 +102,6 @@ define i64 @umod128(i128 %x) nounwind {
; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax
; WIN64-NEXT: leaq (%rdx,%rax,2), %rax
; WIN64-NEXT: subq %rax, %r8
-; WIN64-NEXT: movabsq $1676976733973595602, %rcx # imm = 0x1745D1745D1745D2
-; WIN64-NEXT: movq %r8, %rax
-; WIN64-NEXT: mulq %rcx
-; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax
-; WIN64-NEXT: movl %eax, %eax
-; WIN64-NEXT: leaq (%rdx,%rax,2), %rax
-; WIN64-NEXT: subq %rax, %r8
; WIN64-NEXT: movq %r8, %rax
; WIN64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll
index 5e55e072e0d63..8dbef71c2225c 100644
--- a/llvm/test/CodeGen/X86/i128-udiv.ll
+++ b/llvm/test/CodeGen/X86/i128-udiv.ll
@@ -954,17 +954,10 @@ define i128 @div_by_7(i128 %x) {
; X64-NEXT: movabsq $5270498306774157605, %rdx # imm = 0x4924924924924925
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdx
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: shrq %r8
-; X64-NEXT: leaq (,%r8,8), %rax
-; X64-NEXT: subq %rax, %r8
-; X64-NEXT: addq %rcx, %r8
-; X64-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: leal (,%rdx,8), %eax
+; X64-NEXT: shrq %rdx
+; X64-NEXT: leaq (,%rdx,8), %rax
; X64-NEXT: subq %rax, %rdx
-; X64-NEXT: addq %r8, %rdx
+; X64-NEXT: addq %rcx, %rdx
; X64-NEXT: subq %rdx, %rdi
; X64-NEXT: sbbq $0, %rsi
; X64-NEXT: movabsq $-5270498306774157605, %rcx # imm = 0xB6DB6DB6DB6DB6DB
@@ -1292,13 +1285,9 @@ define i128 @div_by_9(i128 %x) {
; X64-NEXT: movq %rsi, %rcx
; X64-NEXT: shrq $56, %rcx
; X64-NEXT: addq %rdx, %rcx
-; X64-NEXT: movabsq $2049638230412172402, %r8 # imm = 0x1C71C71C71C71C72
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: leaq (%rdx,%rdx,8), %rax
-; X64-NEXT: subq %rax, %rcx
+; X64-NEXT: movabsq $2049638230412172402, %rdx # imm = 0x1C71C71C71C71C72
; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r8
+; X64-NEXT: mulq %rdx
; X64-NEXT: leaq (%rdx,%rdx,8), %rax
; X64-NEXT: subq %rax, %rcx
; X64-NEXT: subq %rcx, %rdi
@@ -1635,13 +1624,6 @@ define i128 @div_by_11(i128 %x) {
; X64-NEXT: leaq (%rdx,%rdx,4), %rax
; X64-NEXT: leaq (%rdx,%rax,2), %rax
; X64-NEXT: subq %rax, %rcx
-; X64-NEXT: movabsq $1676976733973595602, %rdx # imm = 0x1745D1745D1745D2
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %rdx
-; X64-NEXT: leaq (%rdx,%rdx,4), %rax
-; X64-NEXT: movl %eax, %eax
-; X64-NEXT: leaq (%rdx,%rax,2), %rax
-; X64-NEXT: subq %rax, %rcx
; X64-NEXT: subq %rcx, %rdi
; X64-NEXT: sbbq $0, %rsi
; X64-NEXT: movabsq $-6707906935894382406, %rcx # imm = 0xA2E8BA2E8BA2E8BA
@@ -2288,12 +2270,6 @@ define i128 @div_by_25(i128 %x) {
; X64-NEXT: leaq (%rdx,%rdx,4), %rax
; X64-NEXT: leaq (%rax,%rax,4), %rax
; X64-NEXT: subq %rax, %rcx
-; X64-NEXT: movabsq $737869762948382065, %rdx # imm = 0xA3D70A3D70A3D71
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %rdx
-; X64-NEXT: leaq (%rdx,%rdx,4), %rax
-; X64-NEXT: leaq (%rax,%rax,4), %rax
-; X64-NEXT: subq %rax, %rcx
; X64-NEXT: subq %rcx, %rdi
; X64-NEXT: sbbq $0, %rsi
; X64-NEXT: movabsq $2951479051793528258, %rcx # imm = 0x28F5C28F5C28F5C2
@@ -2958,12 +2934,6 @@ define i128 @rem_by_7(i128 %x) {
; X64-NEXT: leaq (,%rdx,8), %rax
; X64-NEXT: subq %rax, %rdx
; X64-NEXT: addq %rdx, %rcx
-; X64-NEXT: movabsq $2635249153387078803, %rdx # imm = 0x2492492492492493
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %rdx
-; X64-NEXT: leal (,%rdx,8), %eax
-; X64-NEXT: subq %rax, %rdx
-; X64-NEXT: addq %rdx, %rcx
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index e99013a09182b..e44295601cbac 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -1217,83 +1217,64 @@ define <2 x i128> @v2i128_div_by_7(<2 x i128> %x) {
; SSE-NEXT: .cfi_def_cfa_offset 16
; SSE-NEXT: pushq %r14
; SSE-NEXT: .cfi_def_cfa_offset 24
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: .cfi_def_cfa_offset 32
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: .cfi_def_cfa_offset 40
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: .cfi_def_cfa_offset 48
-; SSE-NEXT: .cfi_offset %rbx, -48
-; SSE-NEXT: .cfi_offset %r12, -40
-; SSE-NEXT: .cfi_offset %r13, -32
+; SSE-NEXT: .cfi_def_cfa_offset 32
+; SSE-NEXT: .cfi_offset %rbx, -32
; SSE-NEXT: .cfi_offset %r14, -24
; SSE-NEXT: .cfi_offset %r15, -16
; SSE-NEXT: movq %rcx, %r9
; SSE-NEXT: movq %rdx, %rcx
-; SSE-NEXT: movabsq $1152921504606846975, %r15 # imm = 0xFFFFFFFFFFFFFFF
+; SSE-NEXT: movabsq $1152921504606846975, %r14 # imm = 0xFFFFFFFFFFFFFFF
; SSE-NEXT: movq %rsi, %rax
-; SSE-NEXT: andq %r15, %rax
+; SSE-NEXT: andq %r14, %rax
; SSE-NEXT: movq %rsi, %rdx
; SSE-NEXT: shrdq $60, %rcx, %rdx
-; SSE-NEXT: andq %r15, %rdx
+; SSE-NEXT: andq %r14, %rdx
; SSE-NEXT: addq %rax, %rdx
; SSE-NEXT: movq %rcx, %r10
; SSE-NEXT: shrq $56, %r10
; SSE-NEXT: addq %rdx, %r10
-; SSE-NEXT: movabsq $5270498306774157605, %r12 # imm = 0x4924924924924925
+; SSE-NEXT: movabsq $5270498306774157605, %r15 # imm = 0x4924924924924925
; SSE-NEXT: movq %r10, %rax
-; SSE-NEXT: mulq %r12
-; SSE-NEXT: movq %rdx, %r11
-; SSE-NEXT: shrq %r11
-; SSE-NEXT: leaq (,%r11,8), %rax
-; SSE-NEXT: subq %rax, %r11
-; SSE-NEXT: addq %r10, %r11
-; SSE-NEXT: movabsq $2635249153387078803, %r13 # imm = 0x2492492492492493
-; SSE-NEXT: movq %r11, %rax
-; SSE-NEXT: mulq %r13
-; SSE-NEXT: leal (,%rdx,8), %eax
+; SSE-NEXT: mulq %r15
+; SSE-NEXT: shrq %rdx
+; SSE-NEXT: leaq (,%rdx,8), %rax
; SSE-NEXT: subq %rax, %rdx
-; SSE-NEXT: addq %r11, %rdx
+; SSE-NEXT: addq %r10, %rdx
; SSE-NEXT: subq %rdx, %rsi
; SSE-NEXT: sbbq $0, %rcx
-; SSE-NEXT: movabsq $-5270498306774157605, %rbx # imm = 0xB6DB6DB6DB6DB6DB
+; SSE-NEXT: movabsq $-5270498306774157605, %r11 # imm = 0xB6DB6DB6DB6DB6DB
; SSE-NEXT: movq %rsi, %r10
-; SSE-NEXT: imulq %rbx, %r10
-; SSE-NEXT: movabsq $7905747460161236407, %r14 # imm = 0x6DB6DB6DB6DB6DB7
+; SSE-NEXT: imulq %r11, %r10
+; SSE-NEXT: movabsq $7905747460161236407, %rbx # imm = 0x6DB6DB6DB6DB6DB7
; SSE-NEXT: movq %rsi, %rax
-; SSE-NEXT: mulq %r14
+; SSE-NEXT: mulq %rbx
; SSE-NEXT: movq %rax, %rsi
; SSE-NEXT: addq %r10, %rdx
-; SSE-NEXT: imulq %r14, %rcx
+; SSE-NEXT: imulq %rbx, %rcx
; SSE-NEXT: addq %rdx, %rcx
; SSE-NEXT: movq %r9, %rax
-; SSE-NEXT: andq %r15, %rax
+; SSE-NEXT: andq %r14, %rax
; SSE-NEXT: movq %r9, %rdx
; SSE-NEXT: shrdq $60, %r8, %rdx
-; SSE-NEXT: andq %r15, %rdx
+; SSE-NEXT: andq %r14, %rdx
; SSE-NEXT: addq %rax, %rdx
; SSE-NEXT: movq %r8, %r10
; SSE-NEXT: shrq $56, %r10
; SSE-NEXT: addq %rdx, %r10
; SSE-NEXT: movq %r10, %rax
-; SSE-NEXT: mulq %r12
-; SSE-NEXT: movq %rdx, %r11
-; SSE-NEXT: shrq %r11
-; SSE-NEXT: leaq (,%r11,8), %rax
-; SSE-NEXT: subq %rax, %r11
-; SSE-NEXT: addq %r10, %r11
-; SSE-NEXT: movq %r11, %rax
-; SSE-NEXT: mulq %r13
-; SSE-NEXT: leal (,%rdx,8), %eax
+; SSE-NEXT: mulq %r15
+; SSE-NEXT: shrq %rdx
+; SSE-NEXT: leaq (,%rdx,8), %rax
; SSE-NEXT: subq %rax, %rdx
-; SSE-NEXT: addq %r11, %rdx
+; SSE-NEXT: addq %r10, %rdx
; SSE-NEXT: subq %rdx, %r9
; SSE-NEXT: sbbq $0, %r8
-; SSE-NEXT: imulq %r9, %rbx
+; SSE-NEXT: imulq %r9, %r11
; SSE-NEXT: movq %r9, %rax
-; SSE-NEXT: mulq %r14
-; SSE-NEXT: addq %rbx, %rdx
-; SSE-NEXT: imulq %r14, %r8
+; SSE-NEXT: mulq %rbx
+; SSE-NEXT: addq %r11, %rdx
+; SSE-NEXT: imulq %rbx, %r8
; SSE-NEXT: addq %rdx, %r8
; SSE-NEXT: movq %rax, 16(%rdi)
; SSE-NEXT: movq %rsi, (%rdi)
@@ -1301,10 +1282,6 @@ define <2 x i128> @v2i128_div_by_7(<2 x i128> %x) {
; SSE-NEXT: movq %rcx, 8(%rdi)
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: popq %rbx
-; SSE-NEXT: .cfi_def_cfa_offset 40
-; SSE-NEXT: popq %r12
-; SSE-NEXT: .cfi_def_cfa_offset 32
-; SSE-NEXT: popq %r13
; SSE-NEXT: .cfi_def_cfa_offset 24
; SSE-NEXT: popq %r14
; SSE-NEXT: .cfi_def_cfa_offset 16
@@ -1318,83 +1295,64 @@ define <2 x i128> @v2i128_div_by_7(<2 x i128> %x) {
; AVX-NEXT: .cfi_def_cfa_offset 16
; AVX-NEXT: pushq %r14
; AVX-NEXT: .cfi_def_cfa_offset 24
-; AVX-NEXT: pushq %r13
-; AVX-NEXT: .cfi_def_cfa_offset 32
-; AVX-NEXT: pushq %r12
-; AVX-NEXT: .cfi_def_cfa_offset 40
; AVX-NEXT: pushq %rbx
-; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: .cfi_offset %rbx, -48
-; AVX-NEXT: .cfi_offset %r12, -40
-; AVX-NEXT: .cfi_offset %r13, -32
+; AVX-NEXT: .cfi_def_cfa_offset 32
+; AVX-NEXT: .cfi_offset %rbx, -32
; AVX-NEXT: .cfi_offset %r14, -24
; AVX-NEXT: .cfi_offset %r15, -16
; AVX-NEXT: movq %rcx, %r9
; AVX-NEXT: movq %rdx, %rcx
-; AVX-NEXT: movabsq $1152921504606846975, %r15 # imm = 0xFFFFFFFFFFFFFFF
+; AVX-NEXT: movabsq $1152921504606846975, %r14 # imm = 0xFFFFFFFFFFFFFFF
; AVX-NEXT: movq %rsi, %rax
-; AVX-NEXT: andq %r15, %rax
+; AVX-NEXT: andq %r14, %rax
; AVX-NEXT: movq %rsi, %rdx
; AVX-NEXT: shrdq $60, %rcx, %rdx
-; AVX-NEXT: andq %r15, %rdx
+; AVX-NEXT: andq %r14, %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: movq %rcx, %r10
; AVX-NEXT: shrq $56, %r10
; AVX-NEXT: addq %rdx, %r10
-; AVX-NEXT: movabsq $5270498306774157605, %r12 # imm = 0x4924924924924925
+; AVX-NEXT: movabsq $5270498306774157605, %r15 # imm = 0x4924924924924925
; AVX-NEXT: movq %r10, %rax
-; AVX-NEXT: mulq %r12
-; AVX-NEXT: movq %rdx, %r11
-; AVX-NEXT: shrq %r11
-; AVX-NEXT: leaq (,%r11,8), %rax
-; AVX-NEXT: subq %rax, %r11
-; AVX-NEXT: addq %r10, %r11
-; AVX-NEXT: movabsq $2635249153387078803, %r13 # imm = 0x2492492492492493
-; AVX-NEXT: movq %r11, %rax
-; AVX-NEXT: mulq %r13
-; AVX-NEXT: leal (,%rdx,8), %eax
+; AVX-NEXT: mulq %r15
+; AVX-NEXT: shrq %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: addq %r11, %rdx
+; AVX-NEXT: addq %r10, %rdx
; AVX-NEXT: subq %rdx, %rsi
; AVX-NEXT: sbbq $0, %rcx
-; AVX-NEXT: movabsq $-5270498306774157605, %rbx # imm = 0xB6DB6DB6DB6DB6DB
+; AVX-NEXT: movabsq $-5270498306774157605, %r11 # imm = 0xB6DB6DB6DB6DB6DB
; AVX-NEXT: movq %rsi, %r10
-; AVX-NEXT: imulq %rbx, %r10
-; AVX-NEXT: movabsq $7905747460161236407, %r14 # imm = 0x6DB6DB6DB6DB6DB7
+; AVX-NEXT: imulq %r11, %r10
+; AVX-NEXT: movabsq $7905747460161236407, %rbx # imm = 0x6DB6DB6DB6DB6DB7
; AVX-NEXT: movq %rsi, %rax
-; AVX-NEXT: mulq %r14
+; AVX-NEXT: mulq %rbx
; AVX-NEXT: movq %rax, %rsi
; AVX-NEXT: addq %r10, %rdx
-; AVX-NEXT: imulq %r14, %rcx
+; AVX-NEXT: imulq %rbx, %rcx
; AVX-NEXT: addq %rdx, %rcx
; AVX-NEXT: movq %r9, %rax
-; AVX-NEXT: andq %r15, %rax
+; AVX-NEXT: andq %r14, %rax
; AVX-NEXT: movq %r9, %rdx
; AVX-NEXT: shrdq $60, %r8, %rdx
-; AVX-NEXT: andq %r15, %rdx
+; AVX-NEXT: andq %r14, %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: movq %r8, %r10
; AVX-NEXT: shrq $56, %r10
; AVX-NEXT: addq %rdx, %r10
; AVX-NEXT: movq %r10, %rax
-; AVX-NEXT: mulq %r12
-; AVX-NEXT: movq %rdx, %r11
-; AVX-NEXT: shrq %r11
-; AVX-NEXT: leaq (,%r11,8), %rax
-; AVX-NEXT: subq %rax, %r11
-; AVX-NEXT: addq %r10, %r11
-; AVX-NEXT: movq %r11, %rax
-; AVX-NEXT: mulq %r13
-; AVX-NEXT: leal (,%rdx,8), %eax
+; AVX-NEXT: mulq %r15
+; AVX-NEXT: shrq %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: addq %r11, %rdx
+; AVX-NEXT: addq %r10, %rdx
; AVX-NEXT: subq %rdx, %r9
; AVX-NEXT: sbbq $0, %r8
-; AVX-NEXT: imulq %r9, %rbx
+; AVX-NEXT: imulq %r9, %r11
; AVX-NEXT: movq %r9, %rax
-; AVX-NEXT: mulq %r14
-; AVX-NEXT: addq %rbx, %rdx
-; AVX-NEXT: imulq %r14, %r8
+; AVX-NEXT: mulq %rbx
+; AVX-NEXT: addq %r11, %rdx
+; AVX-NEXT: imulq %rbx, %r8
; AVX-NEXT: addq %rdx, %r8
; AVX-NEXT: movq %rax, 16(%rdi)
; AVX-NEXT: movq %rsi, (%rdi)
@@ -1402,10 +1360,6 @@ define <2 x i128> @v2i128_div_by_7(<2 x i128> %x) {
; AVX-NEXT: movq %rcx, 8(%rdi)
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: popq %rbx
-; AVX-NEXT: .cfi_def_cfa_offset 40
-; AVX-NEXT: popq %r12
-; AVX-NEXT: .cfi_def_cfa_offset 32
-; AVX-NEXT: popq %r13
; AVX-NEXT: .cfi_def_cfa_offset 24
; AVX-NEXT: popq %r14
; AVX-NEXT: .cfi_def_cfa_offset 16
@@ -1525,116 +1479,80 @@ entry:
define <2 x i128> @v2i128_rem_by_7(<2 x i128> %x) {
; SSE-LABEL: v2i128_rem_by_7:
; SSE: # %bb.0: # %entry
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: .cfi_def_cfa_offset 16
-; SSE-NEXT: .cfi_offset %rbx, -16
; SSE-NEXT: movq %rdx, %r9
-; SSE-NEXT: movabsq $1152921504606846975, %r11 # imm = 0xFFFFFFFFFFFFFFF
+; SSE-NEXT: movabsq $1152921504606846975, %r10 # imm = 0xFFFFFFFFFFFFFFF
; SSE-NEXT: movq %rsi, %rax
-; SSE-NEXT: andq %r11, %rax
+; SSE-NEXT: andq %r10, %rax
; SSE-NEXT: shrdq $60, %rdx, %rsi
-; SSE-NEXT: andq %r11, %rsi
+; SSE-NEXT: andq %r10, %rsi
; SSE-NEXT: addq %rax, %rsi
; SSE-NEXT: shrq $56, %r9
; SSE-NEXT: addq %rsi, %r9
-; SSE-NEXT: movabsq $5270498306774157605, %rbx # imm = 0x4924924924924925
+; SSE-NEXT: movabsq $5270498306774157605, %r11 # imm = 0x4924924924924925
; SSE-NEXT: movq %r9, %rax
-; SSE-NEXT: mulq %rbx
-; SSE-NEXT: movq %rdx, %r10
-; SSE-NEXT: shrq %r10
-; SSE-NEXT: leaq (,%r10,8), %rax
-; SSE-NEXT: subq %rax, %r10
-; SSE-NEXT: addq %r9, %r10
-; SSE-NEXT: movabsq $2635249153387078803, %r9 # imm = 0x2492492492492493
-; SSE-NEXT: movq %r10, %rax
-; SSE-NEXT: mulq %r9
+; SSE-NEXT: mulq %r11
; SSE-NEXT: movq %rdx, %rsi
-; SSE-NEXT: leal (,%rdx,8), %eax
+; SSE-NEXT: shrq %rsi
+; SSE-NEXT: leaq (,%rsi,8), %rax
; SSE-NEXT: subq %rax, %rsi
-; SSE-NEXT: addq %r10, %rsi
+; SSE-NEXT: addq %r9, %rsi
; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: andq %r11, %rax
+; SSE-NEXT: andq %r10, %rax
; SSE-NEXT: shrdq $60, %r8, %rcx
-; SSE-NEXT: andq %r11, %rcx
+; SSE-NEXT: andq %r10, %rcx
; SSE-NEXT: addq %rax, %rcx
; SSE-NEXT: shrq $56, %r8
; SSE-NEXT: addq %rcx, %r8
; SSE-NEXT: movq %r8, %rax
-; SSE-NEXT: mulq %rbx
-; SSE-NEXT: movq %rdx, %rcx
-; SSE-NEXT: shrq %rcx
-; SSE-NEXT: leaq (,%rcx,8), %rax
-; SSE-NEXT: subq %rax, %rcx
-; SSE-NEXT: addq %r8, %rcx
-; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: mulq %r9
-; SSE-NEXT: leal (,%rdx,8), %eax
+; SSE-NEXT: mulq %r11
+; SSE-NEXT: shrq %rdx
+; SSE-NEXT: leaq (,%rdx,8), %rax
; SSE-NEXT: subq %rax, %rdx
-; SSE-NEXT: addq %rcx, %rdx
+; SSE-NEXT: addq %r8, %rdx
; SSE-NEXT: movq %rdx, 16(%rdi)
; SSE-NEXT: movq %rsi, (%rdi)
; SSE-NEXT: movq $0, 24(%rdi)
; SSE-NEXT: movq $0, 8(%rdi)
; SSE-NEXT: movq %rdi, %rax
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: .cfi_def_cfa_offset 8
; SSE-NEXT: retq
;
; AVX-LABEL: v2i128_rem_by_7:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: pushq %rbx
-; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: .cfi_offset %rbx, -16
; AVX-NEXT: movq %rdx, %r9
-; AVX-NEXT: movabsq $1152921504606846975, %r11 # imm = 0xFFFFFFFFFFFFFFF
+; AVX-NEXT: movabsq $1152921504606846975, %r10 # imm = 0xFFFFFFFFFFFFFFF
; AVX-NEXT: movq %rsi, %rax
-; AVX-NEXT: andq %r11, %rax
+; AVX-NEXT: andq %r10, %rax
; AVX-NEXT: shrdq $60, %rdx, %rsi
-; AVX-NEXT: andq %r11, %rsi
+; AVX-NEXT: andq %r10, %rsi
; AVX-NEXT: addq %rax, %rsi
; AVX-NEXT: shrq $56, %r9
; AVX-NEXT: addq %rsi, %r9
-; AVX-NEXT: movabsq $5270498306774157605, %rbx # imm = 0x4924924924924925
+; AVX-NEXT: movabsq $5270498306774157605, %r11 # imm = 0x4924924924924925
; AVX-NEXT: movq %r9, %rax
-; AVX-NEXT: mulq %rbx
-; AVX-NEXT: movq %rdx, %r10
-; AVX-NEXT: shrq %r10
-; AVX-NEXT: leaq (,%r10,8), %rax
-; AVX-NEXT: subq %rax, %r10
-; AVX-NEXT: addq %r9, %r10
-; AVX-NEXT: movabsq $2635249153387078803, %r9 # imm = 0x2492492492492493
-; AVX-NEXT: movq %r10, %rax
-; AVX-NEXT: mulq %r9
+; AVX-NEXT: mulq %r11
; AVX-NEXT: movq %rdx, %rsi
-; AVX-NEXT: leal (,%rdx,8), %eax
+; AVX-NEXT: shrq %rsi
+; AVX-NEXT: leaq (,%rsi,8), %rax
; AVX-NEXT: subq %rax, %rsi
-; AVX-NEXT: addq %r10, %rsi
+; AVX-NEXT: addq %r9, %rsi
; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: andq %r11, %rax
+; AVX-NEXT: andq %r10, %rax
; AVX-NEXT: shrdq $60, %r8, %rcx
-; AVX-NEXT: andq %r11, %rcx
+; AVX-NEXT: andq %r10, %rcx
; AVX-NEXT: addq %rax, %rcx
; AVX-NEXT: shrq $56, %r8
; AVX-NEXT: addq %rcx, %r8
; AVX-NEXT: movq %r8, %rax
-; AVX-NEXT: mulq %rbx
-; AVX-NEXT: movq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: leaq (,%rcx,8), %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: addq %r8, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %r9
-; AVX-NEXT: leal (,%rdx,8), %eax
+; AVX-NEXT: mulq %r11
+; AVX-NEXT: shrq %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: addq %r8, %rdx
; AVX-NEXT: movq %rdx, 16(%rdi)
; AVX-NEXT: movq %rsi, (%rdi)
; AVX-NEXT: movq $0, 24(%rdi)
; AVX-NEXT: movq $0, 8(%rdi)
; AVX-NEXT: movq %rdi, %rax
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%rem = urem <2 x i128> %x, <i128 7, i128 7>
>From ce8641dca5e966d2d72ffd3723ba74add4e49566 Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg at gmail.com>
Date: Mon, 16 Mar 2026 11:48:27 +0530
Subject: [PATCH 15/15] Use a tighter bound
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 12 ++--
llvm/test/CodeGen/RISCV/div-by-constant.ll | 67 ++++++++++---------
.../CodeGen/RISCV/split-udiv-by-constant.ll | 47 +++++++------
.../CodeGen/RISCV/split-urem-by-constant.ll | 25 ++++---
llvm/test/CodeGen/X86/divide-by-constant.ll | 21 +++---
5 files changed, 96 insertions(+), 76 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 91e1f9bd9201a..1385ad4c80490 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8258,13 +8258,13 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
// LegalVT
unsigned NumChunks = divideCeil(BitWidth, I);
// if the ChunkWidth (I) plus the Potential Carry Bits is less than the
- // Register Width (64), we have enough "slack" at the top of the
+ // Register Width, we have enough "slack" at the top of the
// register to let the carries pile up safely.
- // Max sum is NumChunks * (2^I - 1) so by approximation we need
- // NumChunks × 2^I < 2^L. Taking log on both size we will have
- // log2(NumChunks) + I < L.
- // if (I + llvm::bit_width(NumChunks - 1) <= LegalWidth) {
- if (I + Log2_32_Ceil(NumChunks) < LegalWidth) {
+ // Adding NumChunks I-bit values can produce at most
+ // ceil(log2(NumChunks)) bits of carry, so the sum needs at most
+ // I + ceil(log2(NumChunks)) bits. Require that this still fits in a
+ // legal register: I + ceil(log2(NumChunks)) <= LegalWidth.
+ if (I + llvm::bit_width(NumChunks - 1) <= LegalWidth) {
BestChunkWidth = I;
break;
}
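Stated as standalone code, the tightened search looks roughly like the sketch below (pickChunkWidth is a hypothetical name; the real code works with APInt and LLVM helpers such as divideCeil and llvm::bit_width):

#include <cstdint>
#include <bit> // std::bit_width (C++20)

// Hypothetical standalone paraphrase of the search above, not the patch's
// actual code. Assumes Divisor is odd, > 1, and < 2^63 so the doubling
// below cannot overflow.
unsigned pickChunkWidth(unsigned BitWidth, uint64_t Divisor,
                        unsigned LegalWidth) {
  for (unsigned I = 32; I >= 1; --I) {
    // Chunk width I is usable only if 2^I mod Divisor == 1.
    uint64_t Rem = 1;
    for (unsigned B = 0; B < I; ++B)
      Rem = (Rem * 2) % Divisor;
    if (Rem != 1)
      continue;
    // ceil(BitWidth / I) chunks; their sum needs at most
    // I + ceil(log2(NumChunks)) bits, which must fit in a legal register.
    unsigned NumChunks = (BitWidth + I - 1) / I;
    if (I + std::bit_width(NumChunks - 1u) <= LegalWidth)
      return I;
  }
  return 0; // no usable chunk width; fall back (e.g., to a libcall)
}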
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index 7293a549ac065..1aa0cd053f3ed 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -113,10 +113,10 @@ define i64 @udiv64_constant_no_add(i64 %a) nounwind {
define i64 @udiv64_constant_add(i64 %a) nounwind {
; RV32IM-LABEL: udiv64_constant_add:
; RV32IM: # %bb.0:
-; RV32IM-NEXT: lui a2, 32768
-; RV32IM-NEXT: slli a3, a1, 5
-; RV32IM-NEXT: srli a4, a0, 27
-; RV32IM-NEXT: srli a5, a1, 22
+; RV32IM-NEXT: lui a2, 262144
+; RV32IM-NEXT: slli a3, a1, 2
+; RV32IM-NEXT: srli a4, a0, 30
+; RV32IM-NEXT: srli a5, a1, 28
; RV32IM-NEXT: lui a6, 149797
; RV32IM-NEXT: or a3, a4, a3
; RV32IM-NEXT: lui a4, 449390
@@ -130,6 +130,10 @@ define i64 @udiv64_constant_add(i64 %a) nounwind {
; RV32IM-NEXT: addi a3, a3, -585
; RV32IM-NEXT: add a2, a2, a5
; RV32IM-NEXT: mulhu a5, a2, a6
+; RV32IM-NEXT: sub a6, a2, a5
+; RV32IM-NEXT: srli a6, a6, 1
+; RV32IM-NEXT: add a5, a6, a5
+; RV32IM-NEXT: srli a5, a5, 2
; RV32IM-NEXT: slli a6, a5, 3
; RV32IM-NEXT: sub a5, a5, a6
; RV32IM-NEXT: add a2, a2, a5
@@ -146,35 +150,38 @@ define i64 @udiv64_constant_add(i64 %a) nounwind {
;
; RV32IMZB-LABEL: udiv64_constant_add:
; RV32IMZB: # %bb.0:
-; RV32IMZB-NEXT: slli a2, a1, 5
-; RV32IMZB-NEXT: srli a3, a0, 27
-; RV32IMZB-NEXT: lui a4, 1015808
-; RV32IMZB-NEXT: slli a5, a0, 5
-; RV32IMZB-NEXT: srli a6, a1, 22
-; RV32IMZB-NEXT: or a2, a3, a2
-; RV32IMZB-NEXT: lui a3, 149797
-; RV32IMZB-NEXT: andn a2, a2, a4
-; RV32IMZB-NEXT: lui a4, 449390
-; RV32IMZB-NEXT: srli a5, a5, 5
-; RV32IMZB-NEXT: add a5, a5, a6
-; RV32IMZB-NEXT: lui a6, 748983
-; RV32IMZB-NEXT: addi a3, a3, -1755
-; RV32IMZB-NEXT: addi a4, a4, -1171
-; RV32IMZB-NEXT: addi a6, a6, -585
-; RV32IMZB-NEXT: add a2, a5, a2
-; RV32IMZB-NEXT: mulhu a3, a2, a3
-; RV32IMZB-NEXT: slli a5, a3, 3
-; RV32IMZB-NEXT: sub a3, a3, a5
-; RV32IMZB-NEXT: add a2, a2, a3
-; RV32IMZB-NEXT: sub a3, a0, a2
+; RV32IMZB-NEXT: srli a2, a0, 30
+; RV32IMZB-NEXT: lui a3, 786432
+; RV32IMZB-NEXT: slli a4, a0, 2
+; RV32IMZB-NEXT: srli a5, a1, 28
+; RV32IMZB-NEXT: lui a6, 149797
+; RV32IMZB-NEXT: sh2add a2, a1, a2
+; RV32IMZB-NEXT: andn a2, a2, a3
+; RV32IMZB-NEXT: lui a3, 449390
+; RV32IMZB-NEXT: srli a4, a4, 2
+; RV32IMZB-NEXT: add a4, a4, a5
+; RV32IMZB-NEXT: lui a5, 748983
+; RV32IMZB-NEXT: addi a6, a6, -1755
+; RV32IMZB-NEXT: addi a3, a3, -1171
+; RV32IMZB-NEXT: addi a5, a5, -585
+; RV32IMZB-NEXT: add a2, a4, a2
+; RV32IMZB-NEXT: mulhu a4, a2, a6
+; RV32IMZB-NEXT: sub a6, a2, a4
+; RV32IMZB-NEXT: srli a6, a6, 1
+; RV32IMZB-NEXT: add a4, a6, a4
+; RV32IMZB-NEXT: srli a4, a4, 2
+; RV32IMZB-NEXT: slli a6, a4, 3
+; RV32IMZB-NEXT: sub a4, a4, a6
+; RV32IMZB-NEXT: add a2, a2, a4
+; RV32IMZB-NEXT: sub a4, a0, a2
; RV32IMZB-NEXT: sltu a0, a0, a2
-; RV32IMZB-NEXT: mul a2, a3, a4
-; RV32IMZB-NEXT: mulhu a4, a3, a6
+; RV32IMZB-NEXT: mul a2, a4, a3
+; RV32IMZB-NEXT: mulhu a3, a4, a5
; RV32IMZB-NEXT: sub a1, a1, a0
-; RV32IMZB-NEXT: add a2, a4, a2
-; RV32IMZB-NEXT: mul a1, a1, a6
+; RV32IMZB-NEXT: add a2, a3, a2
+; RV32IMZB-NEXT: mul a1, a1, a5
; RV32IMZB-NEXT: add a1, a2, a1
-; RV32IMZB-NEXT: mul a0, a3, a6
+; RV32IMZB-NEXT: mul a0, a4, a5
; RV32IMZB-NEXT: ret
;
; RV64-LABEL: udiv64_constant_add:
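One Zba note on the RV32IMZB output above: sh2add a2, a1, a2 computes (a1 << 2) + a2, folding the slli/or pair that recombines the 30-bit chunk straddling the two 32-bit halves. Equivalent scalar code (illustrative, not from the patch):

#include <cstdint>

// What the Zba instruction sh2add rd, rs1, rs2 computes: rd = (rs1 << 2) + rs2.
// Here the two operands have disjoint bits, so the add acts as an OR.
uint32_t sh2add(uint32_t rs1, uint32_t rs2) { return (rs1 << 2) + rs2; }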
diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
index 6fbc81e61a973..3ded13cc31c7b 100644
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -117,10 +117,10 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind {
define iXLen2 @test_udiv_7(iXLen2 %x) nounwind {
; RV32-LABEL: test_udiv_7:
; RV32: # %bb.0:
-; RV32-NEXT: lui a2, 32768
-; RV32-NEXT: slli a3, a1, 5
-; RV32-NEXT: srli a4, a0, 27
-; RV32-NEXT: srli a5, a1, 22
+; RV32-NEXT: lui a2, 262144
+; RV32-NEXT: slli a3, a1, 2
+; RV32-NEXT: srli a4, a0, 30
+; RV32-NEXT: srli a5, a1, 28
; RV32-NEXT: lui a6, 149797
; RV32-NEXT: or a3, a4, a3
; RV32-NEXT: lui a4, 449390
@@ -134,6 +134,10 @@ define iXLen2 @test_udiv_7(iXLen2 %x) nounwind {
; RV32-NEXT: addi a3, a3, -585
; RV32-NEXT: add a2, a2, a5
; RV32-NEXT: mulhu a5, a2, a6
+; RV32-NEXT: sub a6, a2, a5
+; RV32-NEXT: srli a6, a6, 1
+; RV32-NEXT: add a5, a6, a5
+; RV32-NEXT: srli a5, a5, 2
; RV32-NEXT: slli a6, a5, 3
; RV32-NEXT: sub a5, a5, a6
; RV32-NEXT: add a2, a2, a5
@@ -190,35 +194,34 @@ define iXLen2 @test_udiv_7(iXLen2 %x) nounwind {
define iXLen2 @test_udiv_9(iXLen2 %x) nounwind {
; RV32-LABEL: test_udiv_9:
; RV32: # %bb.0:
-; RV32-NEXT: lui a2, 4096
-; RV32-NEXT: slli a3, a1, 8
-; RV32-NEXT: srli a4, a0, 24
-; RV32-NEXT: srli a5, a1, 16
-; RV32-NEXT: lui a6, 116508
+; RV32-NEXT: lui a2, 262144
+; RV32-NEXT: slli a3, a1, 2
+; RV32-NEXT: srli a4, a0, 30
+; RV32-NEXT: srli a5, a1, 28
+; RV32-NEXT: lui a6, 233017
; RV32-NEXT: or a3, a4, a3
; RV32-NEXT: lui a4, 582542
; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: addi a6, a6, -455
+; RV32-NEXT: addi a4, a4, 910
; RV32-NEXT: and a3, a3, a2
; RV32-NEXT: and a2, a0, a2
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 233017
-; RV32-NEXT: addi a6, a6, 1821
-; RV32-NEXT: addi a4, a4, 910
-; RV32-NEXT: addi a3, a3, -455
; RV32-NEXT: add a2, a2, a5
-; RV32-NEXT: mulhu a5, a2, a6
-; RV32-NEXT: slli a6, a5, 3
-; RV32-NEXT: add a5, a6, a5
-; RV32-NEXT: sub a2, a2, a5
-; RV32-NEXT: sub a5, a0, a2
+; RV32-NEXT: mulhu a3, a2, a6
+; RV32-NEXT: srli a3, a3, 1
+; RV32-NEXT: slli a5, a3, 3
+; RV32-NEXT: add a3, a5, a3
+; RV32-NEXT: sub a2, a2, a3
+; RV32-NEXT: sub a3, a0, a2
; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: mul a2, a5, a4
-; RV32-NEXT: mulhu a4, a5, a3
+; RV32-NEXT: mul a2, a3, a4
+; RV32-NEXT: mulhu a4, a3, a6
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: add a2, a4, a2
-; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: mul a1, a1, a6
; RV32-NEXT: add a1, a2, a1
-; RV32-NEXT: mul a0, a5, a3
+; RV32-NEXT: mul a0, a3, a6
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_9:
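The RV32 changes above and below follow directly from the relaxed bound: with LegalWidth = 32, a 30-bit chunk gives NumChunks = ceil(64/30) = 3 and 30 + bit_width(3 - 1) = 32 <= 32, so both divisors now use 30-bit chunks where the old strict '<' comparison forced 27-bit (for 7) and 24-bit (for 9) chunks. Illustrative compile-time checks (not part of the patch):

// Illustrative: 30-bit chunks are valid for both divisors on RV32.
static_assert((1ULL << 30) % 7 == 1, "2^30 is 1 mod 7");
static_assert((1ULL << 30) % 9 == 1, "2^30 is 1 mod 9");
// ceil(64/30) = 3 chunks; 30 + bit_width(3 - 1) = 32 <= 32 (RV32 XLEN).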
diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
index bbd320a9381c9..2a890e8bb1aa4 100644
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -79,10 +79,10 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind {
define iXLen2 @test_urem_7(iXLen2 %x) nounwind {
; RV32-LABEL: test_urem_7:
; RV32: # %bb.0:
-; RV32-NEXT: lui a2, 32768
-; RV32-NEXT: slli a3, a1, 5
-; RV32-NEXT: srli a4, a0, 27
-; RV32-NEXT: srli a1, a1, 22
+; RV32-NEXT: lui a2, 262144
+; RV32-NEXT: slli a3, a1, 2
+; RV32-NEXT: srli a4, a0, 30
+; RV32-NEXT: srli a1, a1, 28
; RV32-NEXT: or a3, a4, a3
; RV32-NEXT: lui a4, 149797
; RV32-NEXT: addi a2, a2, -1
@@ -92,6 +92,10 @@ define iXLen2 @test_urem_7(iXLen2 %x) nounwind {
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: addi a1, a4, -1755
; RV32-NEXT: mulhu a1, a0, a1
+; RV32-NEXT: sub a2, a0, a1
+; RV32-NEXT: srli a2, a2, 1
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: srli a1, a1, 2
; RV32-NEXT: slli a2, a1, 3
; RV32-NEXT: sub a1, a1, a2
; RV32-NEXT: add a0, a0, a1
@@ -126,19 +130,20 @@ define iXLen2 @test_urem_7(iXLen2 %x) nounwind {
define iXLen2 @test_urem_9(iXLen2 %x) nounwind {
; RV32-LABEL: test_urem_9:
; RV32: # %bb.0:
-; RV32-NEXT: lui a2, 4096
-; RV32-NEXT: slli a3, a1, 8
-; RV32-NEXT: srli a4, a0, 24
-; RV32-NEXT: srli a1, a1, 16
+; RV32-NEXT: lui a2, 262144
+; RV32-NEXT: slli a3, a1, 2
+; RV32-NEXT: srli a4, a0, 30
+; RV32-NEXT: srli a1, a1, 28
; RV32-NEXT: or a3, a4, a3
-; RV32-NEXT: lui a4, 116508
+; RV32-NEXT: lui a4, 233017
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: and a0, a0, a2
; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: add a0, a0, a2
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: addi a1, a4, 1821
+; RV32-NEXT: addi a1, a4, -455
; RV32-NEXT: mulhu a1, a0, a1
+; RV32-NEXT: srli a1, a1, 1
; RV32-NEXT: slli a2, a1, 3
; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
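The sub/srli/add/srli sequence after the mulhu is the standard unsigned-magic fixup: with 30-bit chunks the chunk sum can occupy all 32 bits, so dividing it by 7 needs the longer Hacker's Delight recipe rather than a plain high-multiply and shift. Roughly, in C++ (illustrative; mirrors the RV32 sequence above and the X86 sequence later in this patch):

#include <cstdint>

// Illustrative: 32-bit unsigned division by 7 using the magic constant
// 0x24924925 plus the add-back fixup emitted above.
uint32_t udiv7(uint32_t n) {
  uint32_t q = (uint32_t)(((uint64_t)n * 0x24924925u) >> 32); // mulhu
  uint32_t t = (n - q) >> 1;                                  // sub, srli 1
  return (t + q) >> 2;                                        // add, srli 2
}

For the remainder, the emitted code then computes n - 7*q with a shift-by-3 and two subtractions/adds, as in the RV32 output above.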
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index 546e375a7f8c6..aa77c1da512e4 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -305,21 +305,26 @@ define i64 @PR23590(i64 %x) nounwind {
; X86-NEXT: addl $16, %esp
; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: andl $134217727, %eax # imm = 0x7FFFFFF
+; X86-NEXT: andl $1073741823, %eax # imm = 0x3FFFFFFF
; X86-NEXT: movl %esi, %edx
-; X86-NEXT: shrdl $27, %ecx, %edx
-; X86-NEXT: andl $134217727, %edx # imm = 0x7FFFFFF
+; X86-NEXT: shrdl $30, %ecx, %edx
+; X86-NEXT: andl $1073741823, %edx # imm = 0x3FFFFFFF
; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: shrl $22, %edi
+; X86-NEXT: shrl $28, %edi
; X86-NEXT: addl %eax, %edi
; X86-NEXT: addl %edx, %edi
; X86-NEXT: movl $613566757, %edx # imm = 0x24924925
; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %edx
-; X86-NEXT: leal (,%edx,8), %eax
-; X86-NEXT: subl %eax, %edx
-; X86-NEXT: addl %edi, %edx
-; X86-NEXT: subl %edx, %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: subl %edx, %eax
+; X86-NEXT: shrl %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: shrl $2, %eax
+; X86-NEXT: leal (,%eax,8), %edx
+; X86-NEXT: subl %edx, %eax
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: subl %eax, %esi
; X86-NEXT: sbbl $0, %ecx
; X86-NEXT: movl $-1227133513, %edx # imm = 0xB6DB6DB7
; X86-NEXT: movl %esi, %eax