[llvm] 1e072ae - [CGP] [CodeGenPrepare] Folding `urem` with loop invariant value plus offset (#104724)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 31 07:14:37 PDT 2024
Author: goldsteinn
Date: 2024-10-31T09:14:33-05:00
New Revision: 1e072ae289d77c3c9704a9ae832c076a303c435b
URL: https://github.com/llvm/llvm-project/commit/1e072ae289d77c3c9704a9ae832c076a303c435b
DIFF: https://github.com/llvm/llvm-project/commit/1e072ae289d77c3c9704a9ae832c076a303c435b.diff
LOG: [CGP] [CodeGenPrepare] Folding `urem` with loop invariant value plus offset (#104724)
This extends the existing fold:
```
for(i = Start; i < End; ++i)
Rem = (i nuw+- IncrLoopInvariant) u% RemAmtLoopInvariant;
```
->
```
Rem = (Start nuw+- IncrLoopInvariant) % RemAmtLoopInvariant;
for(i = Start; i < End; ++i, ++rem)
Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
```
To work with a non-zero `IncrLoopInvariant`.
This is a common usage in cases such as:
```
for(i = 0; i < N; ++i)
if (((i + 1) % X) == 0)
do_something_occasionally_but_not_first_iter();
```
Alive2 w/ i4/unrolled 6x (needs to be run locally due to timeout):
https://alive2.llvm.org/ce/z/6tgyN3
Exhaustive proof over all uint8_t combinations in C++:
https://godbolt.org/z/WYa561388
Added:
Modified:
llvm/lib/CodeGen/CodeGenPrepare.cpp
llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 5224a6c8d1a373..f1ac3d95a8dd87 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1981,17 +1981,36 @@ static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI,
return true;
}
-static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
- const LoopInfo *LI,
- Value *&RemAmtOut,
- PHINode *&LoopIncrPNOut) {
+static bool isRemOfLoopIncrementWithLoopInvariant(
+ Instruction *Rem, const LoopInfo *LI, Value *&RemAmtOut, Value *&AddInstOut,
+ Value *&AddOffsetOut, PHINode *&LoopIncrPNOut) {
Value *Incr, *RemAmt;
// NB: If RemAmt is a power of 2 it *should* have been transformed by now.
if (!match(Rem, m_URem(m_Value(Incr), m_Value(RemAmt))))
return false;
+ Value *AddInst, *AddOffset;
// Find out loop increment PHI.
auto *PN = dyn_cast<PHINode>(Incr);
+ if (PN != nullptr) {
+ AddInst = nullptr;
+ AddOffset = nullptr;
+ } else {
+ // Search through a NUW add on top of the loop increment.
+ Value *V0, *V1;
+ if (!match(Incr, m_NUWAdd(m_Value(V0), m_Value(V1))))
+ return false;
+
+ AddInst = Incr;
+ PN = dyn_cast<PHINode>(V0);
+ if (PN != nullptr) {
+ AddOffset = V1;
+ } else {
+ PN = dyn_cast<PHINode>(V1);
+ AddOffset = V0;
+ }
+ }
+
if (!PN)
return false;
@@ -2031,6 +2050,8 @@ static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
// Set output variables.
RemAmtOut = RemAmt;
LoopIncrPNOut = PN;
+ AddInstOut = AddInst;
+ AddOffsetOut = AddOffset;
return true;
}
@@ -2045,15 +2066,14 @@ static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
// Rem = (Start nuw+ IncrLoopInvariant) % RemAmtLoopInvariant;
// for(i = Start; i < End; ++i, ++rem)
// Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
-//
-// Currently only implemented for `IncrLoopInvariant` being zero.
static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
const LoopInfo *LI,
SmallSet<BasicBlock *, 32> &FreshBBs,
bool IsHuge) {
- Value *RemAmt;
+ Value *AddOffset, *RemAmt, *AddInst;
PHINode *LoopIncrPN;
- if (!isRemOfLoopIncrementWithLoopInvariant(Rem, LI, RemAmt, LoopIncrPN))
+ if (!isRemOfLoopIncrementWithLoopInvariant(Rem, LI, RemAmt, AddInst,
+ AddOffset, LoopIncrPN))
return false;
// Only non-constant remainder as the extra IV is probably not profitable
@@ -2071,6 +2091,23 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
Loop *L = LI->getLoopFor(LoopIncrPN->getParent());
Value *Start = LoopIncrPN->getIncomingValueForBlock(L->getLoopPreheader());
+ // If we have add create initial value for remainder.
+ // The logic here is:
+ // (urem (add nuw Start, IncrLoopInvariant), RemAmtLoopInvariant
+ //
+ // Only proceed if the expression simplifies (otherwise we can't fully
+ // optimize out the urem).
+ if (AddInst) {
+ assert(AddOffset && "We found an add but missing values");
+ // Without dom-condition/assumption cache we aren't likely to get much out
+ // of a context instruction.
+ Start = simplifyAddInst(Start, AddOffset,
+ match(AddInst, m_NSWAdd(m_Value(), m_Value())),
+ /*IsNUW=*/true, *DL);
+ if (!Start)
+ return false;
+ }
+
// If we can't fully optimize out the `rem`, skip this transform.
Start = simplifyURemInst(Start, RemAmt, *DL);
if (!Start)
@@ -2098,9 +2135,12 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
FreshBBs.insert(LoopIncrPN->getParent());
FreshBBs.insert(L->getLoopLatch());
FreshBBs.insert(Rem->getParent());
-
+ if (AddInst)
+ FreshBBs.insert(cast<Instruction>(AddInst)->getParent());
replaceAllUsesWith(Rem, NewRem, FreshBBs, IsHuge);
Rem->eraseFromParent();
+ if (AddInst && AddInst->use_empty())
+ cast<Instruction>(AddInst)->eraseFromParent();
return true;
}
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
index abfbf2e5e582eb..33d18d0e2a795b 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
@@ -319,20 +319,20 @@ for.body.tail:
define void @simple_urem_to_sel_vec(<2 x i64> %rem_amt) nounwind {
; CHECK-LABEL: define void @simple_urem_to_sel_vec(
; CHECK-SAME: <2 x i64> [[REM_AMT:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[ENTRY:.*]]:
+; CHECK: [[FOR_COND_CLEANUP:.*]]:
; CHECK-NEXT: ret void
; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[REM:%.*]] = phi <2 x i64> [ zeroinitializer, %[[FOR_COND_CLEANUP]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[I_04:%.*]] = phi <2 x i64> [ [[INC:%.*]], %[[FOR_BODY]] ], [ zeroinitializer, %[[FOR_COND_CLEANUP]] ]
+; CHECK-NEXT: [[REM:%.*]] = phi <2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[I_04:%.*]] = phi <2 x i64> [ [[INC:%.*]], %[[FOR_BODY]] ], [ zeroinitializer, %[[ENTRY]] ]
; CHECK-NEXT: tail call void @use.2xi64(<2 x i64> [[REM]])
; CHECK-NEXT: [[TMP1:%.*]] = add nuw <2 x i64> [[REM]], <i64 1, i64 1>
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], [[REM_AMT]]
; CHECK-NEXT: [[TMP3]] = select <2 x i1> [[TMP2]], <2 x i64> zeroinitializer, <2 x i64> [[TMP1]]
; CHECK-NEXT: [[INC]] = add nuw <2 x i64> [[I_04]], <i64 1, i64 1>
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = call i1 @get.i1()
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[ENTRY]], label %[[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
;
entry:
br label %for.body
@@ -892,10 +892,12 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_
; CHECK: [[FOR_COND_CLEANUP]]:
; CHECK-NEXT: ret void
; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[REM:%.*]] = phi i32 [ 7, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 2, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[I_WITH_OFF:%.*]] = add nuw i32 [[I_04]], 5
-; CHECK-NEXT: [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
; CHECK-NEXT: tail call void @use.i32(i32 [[REM]])
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[REM]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
+; CHECK-NEXT: [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
; CHECK-NEXT: [[INC]] = add nuw i32 [[I_04]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
More information about the llvm-commits
mailing list