[llvm] make div more assumption aware for better code (PR #185784)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 10 18:55:05 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Takashi Idobe (Takashiidobe)
<details>
<summary>Changes</summary>
Related: https://github.com/llvm/llvm-project/issues/115158
The linked issue points out that there are some cases where a division or modulo operation carries enough information to prove that the runtime width-check branch around the division is unnecessary (e.g. long / long or long / short division). In those cases no branch should be emitted, because the compiler already knows which div operation is appropriate and can emit that code directly without a runtime check.
This handles the following cases:
1. long dividend, short divisor, unsigned divide
- udiv -> single divq
2. long dividend, short divisor, unsigned remainder
- urem -> single divq
3. long signed dividend, short signed divisor, signed divide
- sdiv -> single idivq
4. long signed dividend, short signed divisor, signed remainder
- srem -> single idivq
and the negative case:
1. dividend and divisor of unknown range, with no width information available: a runtime branch is still emitted.
This is just a few of the cases the issue points out, so this just handles some basic cases.
On x86 the changes are:
For these cases:
```c
#include <stdint.h>
uint32_t div_u64_u32_if_div_gt_u32_max(uint64_t n, uint32_t d) {
if (n <= UINT32_MAX) {
__builtin_unreachable();
}
return (uint32_t)(n / (uint64_t)d);
}
uint32_t rem_u64_u32_if_div_gt_u32_max(uint64_t n, uint32_t d) {
if (n <= UINT32_MAX) {
__builtin_unreachable();
}
return (uint32_t)(n % (uint64_t)d);
}
int32_t div_i64_i32_if_div_gt_u32_max(int64_t n, int32_t d) {
if (n <= 4294967295LL) {
__builtin_unreachable();
}
return (int32_t)(n / (int64_t)d);
}
int32_t rem_i64_i32_if_div_gt_u32_max(int64_t n, int32_t d) {
if (n <= 4294967295LL) {
__builtin_unreachable();
}
return (int32_t)(n % (int64_t)d);
}
```
Current head emits this code (all with branches):
```asm
div_u64_u32_if_div_gt_u32_max: # @div_u64_u32_if_div_gt_u32_max
mov rax, rdi
mov ecx, esi
mov rdx, rdi
shr rdx, 32
je .LBB0_1
xor edx, edx
div rcx
ret
.LBB0_1:
xor edx, edx
div ecx
ret
rem_u64_u32_if_div_gt_u32_max: # @rem_u64_u32_if_div_gt_u32_max
mov rax, rdi
mov ecx, esi
mov rdx, rdi
shr rdx, 32
je .LBB1_1
xor edx, edx
div rcx
mov rax, rdx
ret
.LBB1_1:
xor edx, edx
div ecx
mov eax, edx
ret
div_i64_i32_if_div_gt_u32_max: # @div_i64_i32_if_div_gt_u32_max
mov rax, rdi
movsxd rcx, esi
mov rdx, rdi
or rdx, rcx
shr rdx, 32
je .LBB2_1
cqo
idiv rcx
ret
.LBB2_1:
xor edx, edx
div esi
ret
rem_i64_i32_if_div_gt_u32_max: # @rem_i64_i32_if_div_gt_u32_max
mov rax, rdi
movsxd rcx, esi
mov rdx, rdi
or rdx, rcx
shr rdx, 32
je .LBB3_1
cqo
idiv rcx
mov rax, rdx
ret
.LBB3_1:
xor edx, edx
div esi
mov eax, edx
ret
```
And after the fix:
```asm
div_u64_u32_if_div_gt_u32_max: # @div_u64_u32_if_div_gt_u32_max
mov rax, rdi
mov ecx, esi
xor edx, edx
div rcx
ret
rem_u64_u32_if_div_gt_u32_max: # @rem_u64_u32_if_div_gt_u32_max
mov rax, rdi
mov ecx, esi
xor edx, edx
div rcx
mov rax, rdx
ret
div_i64_i32_if_div_gt_u32_max: # @div_i64_i32_if_div_gt_u32_max
mov rax, rdi
movsxd rcx, esi
cqo
idiv rcx
ret
rem_i64_i32_if_div_gt_u32_max: # @rem_i64_i32_if_div_gt_u32_max
mov rax, rdi
movsxd rcx, esi
cqo
idiv rcx
mov rax, rdx
ret
```
---
Full diff: https://github.com/llvm/llvm-project/pull/185784.diff
4 Files Affected:
- (modified) llvm/include/llvm/Transforms/Utils/BypassSlowDivision.h (+4-2)
- (modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+7-1)
- (modified) llvm/lib/Transforms/Utils/BypassSlowDivision.cpp (+19-5)
- (modified) llvm/test/CodeGen/X86/bypass-slow-division-64.ll (+118)
``````````diff
diff --git a/llvm/include/llvm/Transforms/Utils/BypassSlowDivision.h b/llvm/include/llvm/Transforms/Utils/BypassSlowDivision.h
index bd98c902d1ab4..ffe88f6e232a9 100644
--- a/llvm/include/llvm/Transforms/Utils/BypassSlowDivision.h
+++ b/llvm/include/llvm/Transforms/Utils/BypassSlowDivision.h
@@ -24,6 +24,7 @@
namespace llvm {
+class AssumptionCache;
class BasicBlock;
class Value;
@@ -66,8 +67,9 @@ template <> struct DenseMapInfo<DivRemMapKey> {
///
/// This optimization may add basic blocks immediately after BB; for obvious
/// reasons, you shouldn't pass those blocks to bypassSlowDivision.
-bool bypassSlowDivision(
- BasicBlock *BB, const DenseMap<unsigned int, unsigned int> &BypassWidth);
+bool bypassSlowDivision(BasicBlock *BB,
+ const DenseMap<unsigned int, unsigned int> &BypassWidth,
+ AssumptionCache *AC = nullptr);
} // end namespace llvm
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 537b91bc39e0e..a5e12c7bb4627 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/FloatingPointPredicateUtils.h"
@@ -316,6 +317,7 @@ class CodeGenPrepare {
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
ProfileSummaryInfo *PSI = nullptr;
+ AssumptionCache *AC = nullptr;
/// As we scan instructions optimizing them, this is the next instruction
/// to optimize. Transforms that can invalidate this should update it.
@@ -494,6 +496,7 @@ class CodeGenPrepareLegacyPass : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
// FIXME: When we can selectively preserve passes, preserve the domtree.
AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetPassConfig>();
AU.addRequired<TargetTransformInfoWrapperPass>();
@@ -521,6 +524,7 @@ bool CodeGenPrepareLegacyPass::runOnFunction(Function &F) {
CGP.BPI.reset(new BranchProbabilityInfo(F, *CGP.LI));
CGP.BFI.reset(new BlockFrequencyInfo(F, *CGP.BPI, *CGP.LI));
CGP.PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ CGP.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto BBSPRWP =
getAnalysisIfAvailable<BasicBlockSectionsProfileReaderWrapperPass>();
CGP.BBSectionsProfileReader = BBSPRWP ? &BBSPRWP->getBBSPR() : nullptr;
@@ -533,6 +537,7 @@ INITIALIZE_PASS_BEGIN(CodeGenPrepareLegacyPass, DEBUG_TYPE,
INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReaderWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
@@ -569,6 +574,7 @@ bool CodeGenPrepare::run(Function &F, FunctionAnalysisManager &AM) {
BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ AC = &AM.getResult<AssumptionAnalysis>(F);
BBSectionsProfileReader =
AM.getCachedResult<BasicBlockSectionsProfileReaderAnalysis>(F);
return _run(F);
@@ -612,7 +618,7 @@ bool CodeGenPrepare::_run(Function &F) {
// optimization to those blocks.
BasicBlock *Next = BB->getNextNode();
if (!llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
- EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
+ EverMadeChange |= bypassSlowDivision(BB, BypassWidths, AC);
BB = Next;
}
}
diff --git a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
index 66d8fea251cbd..ecce920134d03 100644
--- a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -76,6 +77,7 @@ class FastDivInsertionTask {
Instruction *SlowDivOrRem = nullptr;
IntegerType *BypassType = nullptr;
BasicBlock *MainBB = nullptr;
+ AssumptionCache *AC = nullptr;
bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
@@ -99,7 +101,8 @@ class FastDivInsertionTask {
Type *getSlowType() { return SlowDivOrRem->getType(); }
public:
- FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
+ FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths,
+ AssumptionCache *AC);
Value *getReplacement(DivCacheTy &Cache);
};
@@ -107,7 +110,9 @@ class FastDivInsertionTask {
} // end anonymous namespace
FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
- const BypassWidthsTy &BypassWidths) {
+ const BypassWidthsTy &BypassWidths,
+ AssumptionCache *AC)
+ : AC(AC) {
switch (I->getOpcode()) {
case Instruction::UDiv:
case Instruction::SDiv:
@@ -232,10 +237,18 @@ ValueRange FastDivInsertionTask::getValueRange(Value *V,
assert(LongLen > ShortLen && "Value type must be wider than BypassType");
unsigned HiBits = LongLen - ShortLen;
+ APInt BypassLimit = APInt(LongLen, 1).shl(ShortLen);
+ ConstantRange CR = computeConstantRange(
+ V, /*ForSigned=*/false, /*UseInstrInfo=*/true, AC, SlowDivOrRem);
+ if (CR.getUnsignedMax().ult(BypassLimit))
+ return VALRNG_KNOWN_SHORT;
+ if (CR.getUnsignedMin().uge(BypassLimit))
+ return VALRNG_LIKELY_LONG;
+
const DataLayout &DL = SlowDivOrRem->getDataLayout();
KnownBits Known(LongLen);
- computeKnownBits(V, Known, DL);
+ computeKnownBits(V, Known, DL, AC, SlowDivOrRem);
if (Known.countMinLeadingZeros() >= HiBits)
return VALRNG_KNOWN_SHORT;
@@ -445,7 +458,8 @@ std::optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
/// This optimization identifies DIV/REM instructions in a BB that can be
/// profitably bypassed and carried out with a shorter, faster divide.
bool llvm::bypassSlowDivision(BasicBlock *BB,
- const BypassWidthsTy &BypassWidths) {
+ const BypassWidthsTy &BypassWidths,
+ AssumptionCache *AC) {
DivCacheTy PerBBDivCache;
bool MadeChange = false;
@@ -460,7 +474,7 @@ bool llvm::bypassSlowDivision(BasicBlock *BB,
if (I->use_empty())
continue;
- FastDivInsertionTask Task(I, BypassWidths);
+ FastDivInsertionTask Task(I, BypassWidths, AC);
if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
I->replaceAllUsesWith(Replacement);
I->eraseFromParent();
diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
index 821b7b8e4144f..95a5ad95cd9da 100644
--- a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
+++ b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
@@ -383,3 +383,121 @@ define void @PR43514(i32 %x, i32 %y) {
%s = srem i64 %z1, %z2
ret void
}
+
+; Width-relevant assumption: dividend is known long, so bypass split should
+; collapse to a single long divide path (no width-check branch).
+define i32 @udiv_i64_i32_assume_dividend_gt_u32_max(i64 %n, i32 %d) {
+; FAST-DIVQ-LABEL: udiv_i64_i32_assume_dividend_gt_u32_max:
+; FAST-DIVQ: # %bb.0:
+; FAST-DIVQ-NOT: je
+; FAST-DIVQ-NOT: divl
+; FAST-DIVQ: divq
+;
+; SLOW-DIVQ-LABEL: udiv_i64_i32_assume_dividend_gt_u32_max:
+; SLOW-DIVQ: # %bb.0:
+; SLOW-DIVQ-NOT: je
+; SLOW-DIVQ-NOT: divl
+; SLOW-DIVQ: divq
+ %cmp = icmp ugt i64 %n, 4294967295
+ call void @llvm.assume(i1 %cmp)
+ %d.ext = zext i32 %d to i64
+ %q = udiv i64 %n, %d.ext
+ %tr = trunc i64 %q to i32
+ ret i32 %tr
+}
+
+; Width-relevant assumption: dividend is known short, so bypass split should
+; collapse to a single short divide path (no width-check branch).
+define i32 @udiv_i64_i32_assume_dividend_le_u32_max(i64 %n, i32 %d) {
+; SLOW-DIVQ-LABEL: udiv_i64_i32_assume_dividend_le_u32_max:
+; SLOW-DIVQ: # %bb.0:
+; SLOW-DIVQ-NOT: je
+; SLOW-DIVQ: divl
+; SLOW-DIVQ-NOT: divq
+ %cmp = icmp ule i64 %n, 4294967295
+ call void @llvm.assume(i1 %cmp)
+ %d.ext = zext i32 %d to i64
+ %q = udiv i64 %n, %d.ext
+ %tr = trunc i64 %q to i32
+ ret i32 %tr
+}
+
+; Width-relevant assumption: dividend is known long, so bypass split should
+; collapse to a single long divide path (no width-check branch).
+define i32 @urem_i64_i32_assume_dividend_gt_u32_max(i64 %n, i32 %d) {
+; SLOW-DIVQ-LABEL: urem_i64_i32_assume_dividend_gt_u32_max:
+; SLOW-DIVQ: # %bb.0:
+; SLOW-DIVQ-NOT: je
+; SLOW-DIVQ-NOT: divl
+; SLOW-DIVQ: divq
+ %cmp = icmp ugt i64 %n, 4294967295
+ call void @llvm.assume(i1 %cmp)
+ %d.ext = zext i32 %d to i64
+ %r = urem i64 %n, %d.ext
+ %tr = trunc i64 %r to i32
+ ret i32 %tr
+}
+
+; Width-relevant signed assumption: dividend is known long, so bypass split
+; should collapse to a single long signed-divide path (no width-check branch).
+define i32 @sdiv_i64_i32_assume_dividend_gt_u32_max(i64 %n, i32 %d) {
+; SLOW-DIVQ-LABEL: sdiv_i64_i32_assume_dividend_gt_u32_max:
+; SLOW-DIVQ: # %bb.0:
+; SLOW-DIVQ-NOT: je
+; SLOW-DIVQ-NOT: idivl
+; SLOW-DIVQ: idivq
+ %cmp = icmp sgt i64 %n, 4294967295
+ call void @llvm.assume(i1 %cmp)
+ %d.ext = sext i32 %d to i64
+ %q = sdiv i64 %n, %d.ext
+ %tr = trunc i64 %q to i32
+ ret i32 %tr
+}
+
+; Width-relevant signed assumption: dividend is known long, so bypass split
+; should collapse to a single long signed-divide path (no width-check branch).
+define i32 @srem_i64_i32_assume_dividend_gt_u32_max(i64 %n, i32 %d) {
+; SLOW-DIVQ-LABEL: srem_i64_i32_assume_dividend_gt_u32_max:
+; SLOW-DIVQ: # %bb.0:
+; SLOW-DIVQ-NOT: je
+; SLOW-DIVQ-NOT: idivl
+; SLOW-DIVQ: idivq
+ %cmp = icmp sgt i64 %n, 4294967295
+ call void @llvm.assume(i1 %cmp)
+ %d.ext = sext i32 %d to i64
+ %r = srem i64 %n, %d.ext
+ %tr = trunc i64 %r to i32
+ ret i32 %tr
+}
+
+; Width-irrelevant assumption (nonzero divisor): bypass should still emit the
+; runtime width-check split and branch between divq/divl paths.
+define i32 @udiv_i64_i32_assume_divisor_nonzero_no_width_fact(i64 %n, i32 %d) {
+; SLOW-DIVQ-LABEL: udiv_i64_i32_assume_divisor_nonzero_no_width_fact:
+; SLOW-DIVQ: # %bb.0:
+; SLOW-DIVQ: je
+; SLOW-DIVQ: divq
+; SLOW-DIVQ: divl
+ %cmp = icmp ne i32 %d, 0
+ call void @llvm.assume(i1 %cmp)
+ %d.ext = zext i32 %d to i64
+ %q = udiv i64 %n, %d.ext
+ %tr = trunc i64 %q to i32
+ ret i32 %tr
+}
+
+; Width-irrelevant assumption (nonzero dividend): bypass should still emit the
+; runtime width-check split and branch between divq/divl paths.
+define i32 @udiv_i64_i32_assume_dividend_nonzero_no_width_fact(i64 %n, i32 %d) {
+; SLOW-DIVQ-LABEL: udiv_i64_i32_assume_dividend_nonzero_no_width_fact:
+; SLOW-DIVQ: # %bb.0:
+; SLOW-DIVQ: je
+; SLOW-DIVQ: divq
+; SLOW-DIVQ: divl
+ %cmp = icmp ne i64 %n, 0
+ call void @llvm.assume(i1 %cmp)
+ %d.ext = zext i32 %d to i64
+ %q = udiv i64 %n, %d.ext
+ %tr = trunc i64 %q to i32
+ ret i32 %tr
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/185784
More information about the llvm-commits
mailing list