[llvm] make div more assumption aware for better code (PR #185784)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 10 18:55:05 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Takashi Idobe (Takashiidobe)
<details>
<summary>Changes</summary>
Related: https://github.com/llvm/llvm-project/issues/115158
The linked issue points out that there are some cases where a division or modulo operation carries enough information to prove that the runtime width-check branch around the division is unnecessary (e.g. long / long or long / short division). In those cases no branch should be emitted, because the compiler already knows which div operation is appropriate and can emit that code directly without a runtime check.
This handles the following cases:
1. long dividend, short divisor, unsigned divide
- udiv -> single divq
2. long dividend, short divisor, unsigned remainder
- urem -> single divq
3. long signed dividend, short signed divisor, signed divide
- sdiv -> single idivq
4. long signed dividend, short signed divisor, signed remainder
- srem -> single idivq
and the negative case:
1. dividend and divisor of unknown range, with no width information available: a runtime branch is still emitted.
This is just a few of the cases the issue points out, so this just handles some basic cases.
On x86 the changes are:
For these cases:
```c
#include <stdint.h>
uint32_t div_u64_u32_if_div_gt_u32_max(uint64_t n, uint32_t d) {
if (n <= UINT32_MAX) {
__builtin_unreachable();
}
return (uint32_t)(n / (uint64_t)d);
}
uint32_t rem_u64_u32_if_div_gt_u32_max(uint64_t n, uint32_t d) {
if (n <= UINT32_MAX) {
__builtin_unreachable();
}
return (uint32_t)(n % (uint64_t)d);
}
int32_t div_i64_i32_if_div_gt_u32_max(int64_t n, int32_t d) {
if (n <= 4294967295LL) {
__builtin_unreachable();
}
return (int32_t)(n / (int64_t)d);
}
int32_t rem_i64_i32_if_div_gt_u32_max(int64_t n, int32_t d) {
if (n <= 4294967295LL) {
__builtin_unreachable();
}
return (int32_t)(n % (int64_t)d);
}
```
Current head emits this code (all with branches):
```asm
div_u64_u32_if_div_gt_u32_max: # @div_u64_u32_if_div_gt_u32_max
mov rax, rdi
mov ecx, esi
mov rdx, rdi
shr rdx, 32
je .LBB0_1
xor edx, edx
div rcx
ret
.LBB0_1:
xor edx, edx
div ecx
ret
rem_u64_u32_if_div_gt_u32_max: # @rem_u64_u32_if_div_gt_u32_max
mov rax, rdi
mov ecx, esi
mov rdx, rdi
shr rdx, 32
je .LBB1_1
xor edx, edx
div rcx
mov rax, rdx
ret
.LBB1_1:
xor edx, edx
div ecx
mov eax, edx
ret
div_i64_i32_if_div_gt_u32_max: # @div_i64_i32_if_div_gt_u32_max
mov rax, rdi
movsxd rcx, esi
mov rdx, rdi
or rdx, rcx
shr rdx, 32
je .LBB2_1
cqo
idiv rcx
ret
.LBB2_1:
xor edx, edx
div esi
ret
rem_i64_i32_if_div_gt_u32_max: # @rem_i64_i32_if_div_gt_u32_max
mov rax, rdi
movsxd rcx, esi
mov rdx, rdi
or rdx, rcx
shr rdx, 32
je .LBB3_1
cqo
idiv rcx
mov rax, rdx
ret
.LBB3_1:
xor edx, edx
div esi
mov eax, edx
ret
```
And after the fix:
```asm
div_u64_u32_if_div_gt_u32_max: # @div_u64_u32_if_div_gt_u32_max
mov rax, rdi
mov ecx, esi
xor edx, edx
div rcx
ret
rem_u64_u32_if_div_gt_u32_max: # @rem_u64_u32_if_div_gt_u32_max
mov rax, rdi
mov ecx, esi
xor edx, edx
div rcx
mov rax, rdx
ret
div_i64_i32_if_div_gt_u32_max: # @div_i64_i32_if_div_gt_u32_max
mov rax, rdi
movsxd rcx, esi
cqo
idiv rcx
ret
rem_i64_i32_if_div_gt_u32_max: # @rem_i64_i32_if_div_gt_u32_max
mov rax, rdi
movsxd rcx, esi
cqo
idiv rcx
mov rax, rdx
ret
```
---
Full diff: https://github.com/llvm/llvm-project/pull/185784.diff
4 Files Affected:
- (modified) llvm/include/llvm/Transforms/Utils/BypassSlowDivision.h (+4-2)
- (modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+7-1)
- (modified) llvm/lib/Transforms/Utils/BypassSlowDivision.cpp (+19-5)
- (modified) llvm/test/CodeGen/X86/bypass-slow-division-64.ll (+118)
``````````diff
diff --git a/llvm/include/llvm/Transforms/Utils/BypassSlowDivision.h b/llvm/include/llvm/Transforms/Utils/BypassSlowDivision.h
index bd98c902d1ab4..ffe88f6e232a9 100644
--- a/llvm/include/llvm/Transforms/Utils/BypassSlowDivision.h
+++ b/llvm/include/llvm/Transforms/Utils/BypassSlowDivision.h
@@ -24,6 +24,7 @@
namespace llvm {
+class AssumptionCache;
class BasicBlock;
class Value;
@@ -66,8 +67,9 @@ template <> struct DenseMapInfo<DivRemMapKey> {
///
/// This optimization may add basic blocks immediately after BB; for obvious
/// reasons, you shouldn't pass those blocks to bypassSlowDivision.
-bool bypassSlowDivision(
- BasicBlock *BB, const DenseMap<unsigned int, unsigned int> &BypassWidth);
+bool bypassSlowDivision(BasicBlock *BB,
+ const DenseMap<unsigned int, unsigned int> &BypassWidth,
+ AssumptionCache *AC = nullptr);
} // end namespace llvm
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 537b91bc39e0e..a5e12c7bb4627 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/FloatingPointPredicateUtils.h"
@@ -316,6 +317,7 @@ class CodeGenPrepare {
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
ProfileSummaryInfo *PSI = nullptr;
+ AssumptionCache *AC = nullptr;
/// As we scan instructions optimizing them, this is the next instruction
/// to optimize. Transforms that can invalidate this should update it.
@@ -494,6 +496,7 @@ class CodeGenPrepareLegacyPass : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
// FIXME: When we can selectively preserve passes, preserve the domtree.
AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetPassConfig>();
AU.addRequired<TargetTransformInfoWrapperPass>();
@@ -521,6 +524,7 @@ bool CodeGenPrepareLegacyPass::runOnFunction(Function &F) {
CGP.BPI.reset(new BranchProbabilityInfo(F, *CGP.LI));
CGP.BFI.reset(new BlockFrequencyInfo(F, *CGP.BPI, *CGP.LI));
CGP.PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ CGP.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto BBSPRWP =
getAnalysisIfAvailable<BasicBlockSectionsProfileReaderWrapperPass>();
CGP.BBSectionsProfileReader = BBSPRWP ? &BBSPRWP->getBBSPR() : nullptr;
@@ -533,6 +537,7 @@ INITIALIZE_PASS_BEGIN(CodeGenPrepareLegacyPass, DEBUG_TYPE,
INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReaderWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
@@ -569,6 +574,7 @@ bool CodeGenPrepare::run(Function &F, FunctionAnalysisManager &AM) {
BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ AC = &AM.getResult<AssumptionAnalysis>(F);
BBSectionsProfileReader =
AM.getCachedResult<BasicBlockSectionsProfileReaderAnalysis>(F);
return _run(F);
@@ -612,7 +618,7 @@ bool CodeGenPrepare::_run(Function &F) {
// optimization to those blocks.
BasicBlock *Next = BB->getNextNode();
if (!llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
- EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
+ EverMadeChange |= bypassSlowDivision(BB, BypassWidths, AC);
BB = Next;
}
}
diff --git a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
index 66d8fea251cbd..ecce920134d03 100644
--- a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -76,6 +77,7 @@ class FastDivInsertionTask {
Instruction *SlowDivOrRem = nullptr;
IntegerType *BypassType = nullptr;
BasicBlock *MainBB = nullptr;
+ AssumptionCache *AC = nullptr;
bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
@@ -99,7 +101,8 @@ class FastDivInsertionTask {
Type *getSlowType() { return SlowDivOrRem->getType(); }
public:
- FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
+ FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths,
+ AssumptionCache *AC);
Value *getReplacement(DivCacheTy &Cache);
};
@@ -107,7 +110,9 @@ class FastDivInsertionTask {
} // end anonymous namespace
FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
- const BypassWidthsTy &BypassWidths) {
+ const BypassWidthsTy &BypassWidths,
+ AssumptionCache *AC)
+ : AC(AC) {
switch (I->getOpcode()) {
case Instruction::UDiv:
case Instruction::SDiv:
@@ -232,10 +237,18 @@ ValueRange FastDivInsertionTask::getValueRange(Value *V,
assert(LongLen > ShortLen && "Value type must be wider than BypassType");
unsigned HiBits = LongLen - ShortLen;
+ APInt BypassLimit = APInt(LongLen, 1).shl(ShortLen);
+ ConstantRange CR = computeConstantRange(
+ V, /*ForSigned=*/false, /*UseInstrInfo=*/true, AC, SlowDivOrRem);
+ if (CR.getUnsignedMax().ult(BypassLimit))
+ return VALRNG_KNOWN_SHORT;
+ if (CR.getUnsignedMin().uge(BypassLimit))
+ return VALRNG_LIKELY_LONG;
+
const DataLayout &DL = SlowDivOrRem->getDataLayout();
KnownBits Known(LongLen);
- computeKnownBits(V, Known, DL);
+ computeKnownBits(V, Known, DL, AC, SlowDivOrRem);
if (Known.countMinLeadingZeros() >= HiBits)
return VALRNG_KNOWN_SHORT;
@@ -445,7 +458,8 @@ std::optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
/// This optimization identifies DIV/REM instructions in a BB that can be
/// profitably bypassed and carried out with a shorter, faster divide.
bool llvm::bypassSlowDivision(BasicBlock *BB,
- const BypassWidthsTy &BypassWidths) {
+ const BypassWidthsTy &BypassWidths,
+ AssumptionCache *AC) {
DivCacheTy PerBBDivCache;
bool MadeChange = false;
@@ -460,7 +474,7 @@ bool llvm::bypassSlowDivision(BasicBlock *BB,
if (I->use_empty())
continue;
- FastDivInsertionTask Task(I, BypassWidths);
+ FastDivInsertionTask Task(I, BypassWidths, AC);
if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
I->replaceAllUsesWith(Replacement);
I->eraseFromParent();
diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
index 821b7b8e4144f..95a5ad95cd9da 100644
--- a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
+++ b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
@@ -383,3 +383,121 @@ define void @PR43514(i32 %x, i32 %y) {
%s = srem i64 %z1, %z2
ret void
}
+
+; Width-relevant assumption: dividend is known long, so bypass split should
+; collapse to a single long divide path (no width-check branch).
+define i32 @udiv_i64_i32_assume_dividend_gt_u32_max(i64 %n, i32 %d) {
+; FAST-DIVQ-LABEL: udiv_i64_i32_assume_dividend_gt_u32_max:
+; FAST-DIVQ: # %bb.0:
+; FAST-DIVQ-NOT: je
+; FAST-DIVQ-NOT: divl
+; FAST-DIVQ: divq
+;
+; SLOW-DIVQ-LABEL: udiv_i64_i32_assume_dividend_gt_u32_max:
+; SLOW-DIVQ: # %bb.0:
+; SLOW-DIVQ-NOT: je
+; SLOW-DIVQ-NOT: divl
+; SLOW-DIVQ: divq
+ %cmp = icmp ugt i64 %n, 4294967295
+ call void @llvm.assume(i1 %cmp)
+ %d.ext = zext i32 %d to i64
+ %q = udiv i64 %n, %d.ext
+ %tr = trunc i64 %q to i32
+ ret i32 %tr
+}
+
+; Width-relevant assumption: dividend is known short, so bypass split should
+; collapse to a single short divide path (no width-check branch).
+define i32 @udiv_i64_i32_assume_dividend_le_u32_max(i64 %n, i32 %d) {
+; SLOW-DIVQ-LABEL: udiv_i64_i32_assume_dividend_le_u32_max:
+; SLOW-DIVQ: # %bb.0:
+; SLOW-DIVQ-NOT: je
+; SLOW-DIVQ: divl
+; SLOW-DIVQ-NOT: divq
+ %cmp = icmp ule i64 %n, 4294967295
+ call void @llvm.assume(i1 %cmp)
+ %d.ext = zext i32 %d to i64
+ %q = udiv i64 %n, %d.ext
+ %tr = trunc i64 %q to i32
+ ret i32 %tr
+}
+
+; Width-relevant assumption: dividend is known long, so bypass split should
+; collapse to a single long divide path (no width-check branch).
+define i32 @urem_i64_i32_assume_dividend_gt_u32_max(i64 %n, i32 %d) {
+; SLOW-DIVQ-LABEL: urem_i64_i32_assume_dividend_gt_u32_max:
+; SLOW-DIVQ: # %bb.0:
+; SLOW-DIVQ-NOT: je
+; SLOW-DIVQ-NOT: divl
+; SLOW-DIVQ: divq
+ %cmp = icmp ugt i64 %n, 4294967295
+ call void @llvm.assume(i1 %cmp)
+ %d.ext = zext i32 %d to i64
+ %r = urem i64 %n, %d.ext
+ %tr = trunc i64 %r to i32
+ ret i32 %tr
+}
+
+; Width-relevant signed assumption: dividend is known long, so bypass split
+; should collapse to a single long signed-divide path (no width-check branch).
+define i32 @sdiv_i64_i32_assume_dividend_gt_u32_max(i64 %n, i32 %d) {
+; SLOW-DIVQ-LABEL: sdiv_i64_i32_assume_dividend_gt_u32_max:
+; SLOW-DIVQ: # %bb.0:
+; SLOW-DIVQ-NOT: je
+; SLOW-DIVQ-NOT: idivl
+; SLOW-DIVQ: idivq
+ %cmp = icmp sgt i64 %n, 4294967295
+ call void @llvm.assume(i1 %cmp)
+ %d.ext = sext i32 %d to i64
+ %q = sdiv i64 %n, %d.ext
+ %tr = trunc i64 %q to i32
+ ret i32 %tr
+}
+
+; Width-relevant signed assumption: dividend is known long, so bypass split
+; should collapse to a single long signed-divide path (no width-check branch).
+define i32 @srem_i64_i32_assume_dividend_gt_u32_max(i64 %n, i32 %d) {
+; SLOW-DIVQ-LABEL: srem_i64_i32_assume_dividend_gt_u32_max:
+; SLOW-DIVQ: # %bb.0:
+; SLOW-DIVQ-NOT: je
+; SLOW-DIVQ-NOT: idivl
+; SLOW-DIVQ: idivq
+ %cmp = icmp sgt i64 %n, 4294967295
+ call void @llvm.assume(i1 %cmp)
+ %d.ext = sext i32 %d to i64
+ %r = srem i64 %n, %d.ext
+ %tr = trunc i64 %r to i32
+ ret i32 %tr
+}
+
+; Width-irrelevant assumption (nonzero divisor): bypass should still emit the
+; runtime width-check split and branch between divq/divl paths.
+define i32 @udiv_i64_i32_assume_divisor_nonzero_no_width_fact(i64 %n, i32 %d) {
+; SLOW-DIVQ-LABEL: udiv_i64_i32_assume_divisor_nonzero_no_width_fact:
+; SLOW-DIVQ: # %bb.0:
+; SLOW-DIVQ: je
+; SLOW-DIVQ: divq
+; SLOW-DIVQ: divl
+ %cmp = icmp ne i32 %d, 0
+ call void @llvm.assume(i1 %cmp)
+ %d.ext = zext i32 %d to i64
+ %q = udiv i64 %n, %d.ext
+ %tr = trunc i64 %q to i32
+ ret i32 %tr
+}
+
+; Width-irrelevant assumption (nonzero dividend): bypass should still emit the
+; runtime width-check split and branch between divq/divl paths.
+define i32 @udiv_i64_i32_assume_dividend_nonzero_no_width_fact(i64 %n, i32 %d) {
+; SLOW-DIVQ-LABEL: udiv_i64_i32_assume_dividend_nonzero_no_width_fact:
+; SLOW-DIVQ: # %bb.0:
+; SLOW-DIVQ: je
+; SLOW-DIVQ: divq
+; SLOW-DIVQ: divl
+ %cmp = icmp ne i64 %n, 0
+ call void @llvm.assume(i1 %cmp)
+ %d.ext = zext i32 %d to i64
+ %q = udiv i64 %n, %d.ext
+ %tr = trunc i64 %q to i32
+ ret i32 %tr
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/185784
More information about the llvm-commits
mailing list