[llvm] [LoopFlatten] Use loop versioning when overflow can't be disproven (PR #78576)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 18 05:42:14 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: John Brawn (john-brawn-arm)
Changes:
Implement the TODO in loop flattening to version the loop when we can't prove that the trip count calculation won't overflow.
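To make the intent concrete, here is a minimal source-level sketch of the versioning (not the pass output; the function names are made up, and `__builtin_mul_overflow` stands in for the `llvm.umul.with.overflow` intrinsic the patch emits):

```c++
// A minimal sketch of the transform's effect, assuming a hypothetical
// loop nest; not actual LoopFlatten output.

// Original nest: i*M + j walks 0 .. N*M-1, but only if N*M doesn't wrap.
void orig(unsigned N, unsigned M, int *A) {
  for (unsigned i = 0; i < N; ++i)
    for (unsigned j = 0; j < M; ++j)
      A[i * M + j] = 0;
}

// Versioned form: the flattened loop runs only when N*M provably fits in
// the trip-count type; otherwise fall back to the original nest.
void flattened(unsigned N, unsigned M, int *A) {
  unsigned Total;
  if (__builtin_mul_overflow(N, M, &Total)) // stand-in for umul.with.overflow
    orig(N, M, A); // overflow at runtime: keep the unflattened version
  else
    for (unsigned K = 0; K < Total; ++K)
      A[K] = 0;
}
```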
---
Patch is 34.19 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/78576.diff
4 Files Affected:
- (modified) llvm/lib/Transforms/Scalar/LoopFlatten.cpp (+60-13)
- (modified) llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll (-114)
- (added) llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll (+418)
- (modified) llvm/test/Transforms/LoopFlatten/widen-iv.ll (+2-2)
``````````diff
diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index 533cefaf106133..697304f513a085 100644
--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -70,6 +70,7 @@
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include <optional>
@@ -97,6 +98,10 @@ static cl::opt<bool>
cl::desc("Widen the loop induction variables, if possible, so "
"overflow checks won't reject flattening"));
+static cl::opt<bool>
+ VersionLoops("loop-flatten-version-loops", cl::Hidden, cl::init(true),
+ cl::desc("Version loops if flattened loop could overflow"));
+
namespace {
// We require all uses of both induction variables to match this pattern:
//
@@ -141,6 +146,8 @@ struct FlattenInfo {
// has been applied. Used to skip
// checks on phi nodes.
+ Value *NewTripCount = nullptr; // The tripcount of the flattened loop.
+
FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL){};
bool isNarrowInductionPhi(PHINode *Phi) {
@@ -752,11 +759,13 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
ORE.emit(Remark);
}
- Value *NewTripCount = BinaryOperator::CreateMul(
- FI.InnerTripCount, FI.OuterTripCount, "flatten.tripcount",
- FI.OuterLoop->getLoopPreheader()->getTerminator());
- LLVM_DEBUG(dbgs() << "Created new trip count in preheader: ";
- NewTripCount->dump());
+ if (!FI.NewTripCount) {
+ FI.NewTripCount = BinaryOperator::CreateMul(
+ FI.InnerTripCount, FI.OuterTripCount, "flatten.tripcount",
+ FI.OuterLoop->getLoopPreheader()->getTerminator());
+ LLVM_DEBUG(dbgs() << "Created new trip count in preheader: ";
+ FI.NewTripCount->dump());
+ }
// Fix up PHI nodes that take values from the inner loop back-edge, which
// we are about to remove.
@@ -769,7 +778,7 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
// Modify the trip count of the outer loop to be the product of the two
// trip counts.
- cast<User>(FI.OuterBranch->getCondition())->setOperand(1, NewTripCount);
+ cast<User>(FI.OuterBranch->getCondition())->setOperand(1, FI.NewTripCount);
// Replace the inner loop backedge with an unconditional branch to the exit.
BasicBlock *InnerExitBlock = FI.InnerLoop->getExitBlock();
@@ -891,7 +900,8 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
ScalarEvolution *SE, AssumptionCache *AC,
const TargetTransformInfo *TTI, LPMUpdater *U,
- MemorySSAUpdater *MSSAU) {
+ MemorySSAUpdater *MSSAU,
+ const LoopAccessInfo &LAI) {
LLVM_DEBUG(
dbgs() << "Loop flattening running on outer loop "
<< FI.OuterLoop->getHeader()->getName() << " and inner loop "
@@ -926,18 +936,53 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
// variable might overflow. In this case, we need to version the loop, and
// select the original version at runtime if the iteration space is too
// large.
- // TODO: We currently don't version the loop.
OverflowResult OR = checkOverflow(FI, DT, AC);
if (OR == OverflowResult::AlwaysOverflowsHigh ||
OR == OverflowResult::AlwaysOverflowsLow) {
LLVM_DEBUG(dbgs() << "Multiply would always overflow, so not profitable\n");
return false;
} else if (OR == OverflowResult::MayOverflow) {
- LLVM_DEBUG(dbgs() << "Multiply might overflow, not flattening\n");
- return false;
+ Module *M = FI.OuterLoop->getHeader()->getParent()->getParent();
+ const DataLayout &DL = M->getDataLayout();
+ if (!VersionLoops) {
+ LLVM_DEBUG(dbgs() << "Multiply might overflow, not flattening\n");
+ return false;
+ } else if (!DL.isLegalInteger(
+ FI.OuterTripCount->getType()->getScalarSizeInBits())) {
+ // If the trip count type isn't legal then it won't be possible to check
+ // for overflow using only a single multiply instruction, so don't
+ // flatten.
+ LLVM_DEBUG(
+ dbgs() << "Can't check overflow efficiently, not flattening\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "Multiply might overflow, versioning loop\n");
+
+ // Version the loop. The overflow check isn't a runtime pointer check, so we
+ // pass an empty list of runtime pointer checks and add our own check
+ // afterwards.
+ BasicBlock *CheckBlock = FI.OuterLoop->getLoopPreheader();
+ ArrayRef<RuntimePointerCheck> Checks(nullptr, nullptr);
+ LoopVersioning LVer(LAI, Checks, FI.OuterLoop, LI, DT, SE);
+ LVer.versionLoop();
+
+ // Check for overflow by calculating the new tripcount using
+ // umul_with_overflow and then checking if it overflowed.
+ BranchInst *Br = cast<BranchInst>(CheckBlock->getTerminator());
+ assert(Br->isConditional() &&
+ "Expected LoopVersioning to generate a conditional branch");
+ IRBuilder<> Builder(Br);
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::umul_with_overflow,
+ FI.OuterTripCount->getType());
+ Value *Call = Builder.CreateCall(F, {FI.OuterTripCount, FI.InnerTripCount},
+ "flatten.mul");
+ FI.NewTripCount = Builder.CreateExtractValue(Call, 0, "flatten.tripcount");
+ Value *Overflow = Builder.CreateExtractValue(Call, 1, "flatten.overflow");
+ Br->setCondition(Overflow);
+ } else {
+ LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n");
}
- LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n");
return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U, MSSAU);
}
@@ -958,13 +1003,15 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM,
// in simplified form, and also needs LCSSA. Running
// this pass will simplify all loops that contain inner loops,
// regardless of whether anything ends up being flattened.
+ LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, nullptr);
for (Loop *InnerLoop : LN.getLoops()) {
auto *OuterLoop = InnerLoop->getParentLoop();
if (!OuterLoop)
continue;
FlattenInfo FI(OuterLoop, InnerLoop);
- Changed |= FlattenLoopPair(FI, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U,
- MSSAU ? &*MSSAU : nullptr);
+ Changed |=
+ FlattenLoopPair(FI, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U,
+ MSSAU ? &*MSSAU : nullptr, LAIM.getInfo(*OuterLoop));
}
if (!Changed)
diff --git a/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll b/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
index 23ea09747cee74..479b5c3388f89c 100644
--- a/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
+++ b/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
@@ -568,72 +568,6 @@ for.cond.cleanup:
ret void
}
-; A 3d loop corresponding to:
-;
-; for (int k = 0; k < N; ++k)
-; for (int i = 0; i < N; ++i)
-; for (int j = 0; j < M; ++j)
-; f(&A[i*M+j]);
-;
-; This could be supported, but isn't at the moment.
-;
-define void @d3_2(i32* %A, i32 %N, i32 %M) {
-entry:
- %cmp30 = icmp sgt i32 %N, 0
- br i1 %cmp30, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup
-
-for.cond1.preheader.lr.ph:
- %cmp625 = icmp sgt i32 %M, 0
- br label %for.cond1.preheader.us
-
-for.cond1.preheader.us:
- %k.031.us = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc13.us, %for.cond1.for.cond.cleanup3_crit_edge.us ]
- br i1 %cmp625, label %for.cond5.preheader.us.us.preheader, label %for.cond5.preheader.us43.preheader
-
-for.cond5.preheader.us43.preheader:
- br label %for.cond1.for.cond.cleanup3_crit_edge.us.loopexit50
-
-for.cond5.preheader.us.us.preheader:
- br label %for.cond5.preheader.us.us
-
-for.cond1.for.cond.cleanup3_crit_edge.us.loopexit:
- br label %for.cond1.for.cond.cleanup3_crit_edge.us
-
-for.cond1.for.cond.cleanup3_crit_edge.us.loopexit50:
- br label %for.cond1.for.cond.cleanup3_crit_edge.us
-
-for.cond1.for.cond.cleanup3_crit_edge.us:
- %inc13.us = add nuw nsw i32 %k.031.us, 1
- %exitcond52 = icmp ne i32 %inc13.us, %N
- br i1 %exitcond52, label %for.cond1.preheader.us, label %for.cond.cleanup.loopexit
-
-for.cond5.preheader.us.us:
- %i.028.us.us = phi i32 [ %inc10.us.us, %for.cond5.for.cond.cleanup7_crit_edge.us.us ], [ 0, %for.cond5.preheader.us.us.preheader ]
- %mul.us.us = mul nsw i32 %i.028.us.us, %M
- br label %for.body8.us.us
-
-for.cond5.for.cond.cleanup7_crit_edge.us.us:
- %inc10.us.us = add nuw nsw i32 %i.028.us.us, 1
- %exitcond51 = icmp ne i32 %inc10.us.us, %N
- br i1 %exitcond51, label %for.cond5.preheader.us.us, label %for.cond1.for.cond.cleanup3_crit_edge.us.loopexit
-
-for.body8.us.us:
- %j.026.us.us = phi i32 [ 0, %for.cond5.preheader.us.us ], [ %inc.us.us, %for.body8.us.us ]
- %add.us.us = add nsw i32 %j.026.us.us, %mul.us.us
- %idxprom.us.us = sext i32 %add.us.us to i64
- %arrayidx.us.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us.us
- tail call void @f(ptr %arrayidx.us.us) #2
- %inc.us.us = add nuw nsw i32 %j.026.us.us, 1
- %exitcond = icmp ne i32 %inc.us.us, %M
- br i1 %exitcond, label %for.body8.us.us, label %for.cond5.for.cond.cleanup7_crit_edge.us.us
-
-for.cond.cleanup.loopexit:
- br label %for.cond.cleanup
-
-for.cond.cleanup:
- ret void
-}
-
; A 3d loop corresponding to:
;
; for (int i = 0; i < N; ++i)
@@ -785,54 +719,6 @@ for.empty:
ret void
}
-; GEP doesn't dominate the loop latch so can't guarantee N*M won't overflow.
- at first = global i32 1, align 4
- at a = external global [0 x i8], align 1
-define void @overflow(i32 %lim, ptr %a) {
-entry:
- %cmp17.not = icmp eq i32 %lim, 0
- br i1 %cmp17.not, label %for.cond.cleanup, label %for.cond1.preheader.preheader
-
-for.cond1.preheader.preheader:
- br label %for.cond1.preheader
-
-for.cond1.preheader:
- %i.018 = phi i32 [ %inc6, %for.cond.cleanup3 ], [ 0, %for.cond1.preheader.preheader ]
- %mul = mul i32 %i.018, 100000
- br label %for.body4
-
-for.cond.cleanup.loopexit:
- br label %for.cond.cleanup
-
-for.cond.cleanup:
- ret void
-
-for.cond.cleanup3:
- %inc6 = add i32 %i.018, 1
- %cmp = icmp ult i32 %inc6, %lim
- br i1 %cmp, label %for.cond1.preheader, label %for.cond.cleanup.loopexit
-
-for.body4:
- %j.016 = phi i32 [ 0, %for.cond1.preheader ], [ %inc, %if.end ]
- %add = add i32 %j.016, %mul
- %0 = load i32, ptr @first, align 4
- %tobool.not = icmp eq i32 %0, 0
- br i1 %tobool.not, label %if.end, label %if.then
-
-if.then:
- %arrayidx = getelementptr inbounds [0 x i8], ptr @a, i32 0, i32 %add
- %1 = load i8, ptr %arrayidx, align 1
- tail call void asm sideeffect "", "r"(i8 %1)
- store i32 0, ptr @first, align 4
- br label %if.end
-
-if.end:
- tail call void asm sideeffect "", "r"(i32 %add)
- %inc = add nuw nsw i32 %j.016, 1
- %cmp2 = icmp ult i32 %j.016, 99999
- br i1 %cmp2, label %for.body4, label %for.cond.cleanup3
-}
-
declare void @objc_enumerationMutation(ptr)
declare dso_local void @f(ptr)
declare dso_local void @g(...)
diff --git a/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll b/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll
new file mode 100644
index 00000000000000..dec323d135f35b
--- /dev/null
+++ b/llvm/test/Transforms/LoopFlatten/loop-flatten-version.ll
@@ -0,0 +1,418 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt %s -S -passes='loop(loop-flatten),verify' -verify-loop-info -verify-dom-info -verify-scev -o - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; We need to version the loop as the GEPs are not inbounds
+define void @noinbounds_gep(i32 %N, ptr %A) {
+; CHECK-LABEL: define void @noinbounds_gep(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i32 0, [[N]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INNER_PREHEADER_LVER_CHECK:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.inner.preheader.lver.check:
+; CHECK-NEXT: [[FLATTEN_MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[N]], i32 [[N]])
+; CHECK-NEXT: [[FLATTEN_TRIPCOUNT:%.*]] = extractvalue { i32, i1 } [[FLATTEN_MUL]], 0
+; CHECK-NEXT: [[FLATTEN_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[FLATTEN_MUL]], 1
+; CHECK-NEXT: br i1 [[FLATTEN_OVERFLOW]], label [[FOR_INNER_PREHEADER_PH_LVER_ORIG:%.*]], label [[FOR_INNER_PREHEADER_PH:%.*]]
+; CHECK: for.inner.preheader.ph.lver.orig:
+; CHECK-NEXT: br label [[FOR_INNER_PREHEADER_LVER_ORIG:%.*]]
+; CHECK: for.inner.preheader.lver.orig:
+; CHECK-NEXT: [[I_LVER_ORIG:%.*]] = phi i32 [ 0, [[FOR_INNER_PREHEADER_PH_LVER_ORIG]] ], [ [[INC2_LVER_ORIG:%.*]], [[FOR_OUTER_LVER_ORIG:%.*]] ]
+; CHECK-NEXT: br label [[FOR_INNER_LVER_ORIG:%.*]]
+; CHECK: for.inner.lver.orig:
+; CHECK-NEXT: [[J_LVER_ORIG:%.*]] = phi i32 [ 0, [[FOR_INNER_PREHEADER_LVER_ORIG]] ], [ [[INC1_LVER_ORIG:%.*]], [[FOR_INNER_LVER_ORIG]] ]
+; CHECK-NEXT: [[MUL_LVER_ORIG:%.*]] = mul i32 [[I_LVER_ORIG]], [[N]]
+; CHECK-NEXT: [[GEP_LVER_ORIG:%.*]] = getelementptr i32, ptr [[A]], i32 [[MUL_LVER_ORIG]]
+; CHECK-NEXT: [[ARRAYIDX_LVER_ORIG:%.*]] = getelementptr i32, ptr [[GEP_LVER_ORIG]], i32 [[J_LVER_ORIG]]
+; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX_LVER_ORIG]], align 4
+; CHECK-NEXT: [[INC1_LVER_ORIG]] = add nuw i32 [[J_LVER_ORIG]], 1
+; CHECK-NEXT: [[CMP2_LVER_ORIG:%.*]] = icmp ult i32 [[INC1_LVER_ORIG]], [[N]]
+; CHECK-NEXT: br i1 [[CMP2_LVER_ORIG]], label [[FOR_INNER_LVER_ORIG]], label [[FOR_OUTER_LVER_ORIG]]
+; CHECK: for.outer.lver.orig:
+; CHECK-NEXT: [[INC2_LVER_ORIG]] = add i32 [[I_LVER_ORIG]], 1
+; CHECK-NEXT: [[CMP1_LVER_ORIG:%.*]] = icmp ult i32 [[INC2_LVER_ORIG]], [[N]]
+; CHECK-NEXT: br i1 [[CMP1_LVER_ORIG]], label [[FOR_INNER_PREHEADER_LVER_ORIG]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]]
+; CHECK: for.inner.preheader.ph:
+; CHECK-NEXT: br label [[FOR_INNER_PREHEADER:%.*]]
+; CHECK: for.inner.preheader:
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[FOR_INNER_PREHEADER_PH]] ], [ [[INC2:%.*]], [[FOR_OUTER:%.*]] ]
+; CHECK-NEXT: [[FLATTEN_ARRAYIDX:%.*]] = getelementptr i32, ptr [[A]], i32 [[I]]
+; CHECK-NEXT: br label [[FOR_INNER:%.*]]
+; CHECK: for.inner:
+; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[FOR_INNER_PREHEADER]] ]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[I]], [[N]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[A]], i32 [[MUL]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i32, ptr [[GEP]], i32 [[J]]
+; CHECK-NEXT: store i32 0, ptr [[FLATTEN_ARRAYIDX]], align 4
+; CHECK-NEXT: [[INC1:%.*]] = add nuw i32 [[J]], 1
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[INC1]], [[N]]
+; CHECK-NEXT: br label [[FOR_OUTER]]
+; CHECK: for.outer:
+; CHECK-NEXT: [[INC2]] = add i32 [[I]], 1
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i32 [[INC2]], [[FLATTEN_TRIPCOUNT]]
+; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_INNER_PREHEADER]], label [[FOR_END_LOOPEXIT_LOOPEXIT1:%.*]]
+; CHECK: for.end.loopexit.loopexit:
+; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK: for.end.loopexit.loopexit1:
+; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]]
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp3 = icmp ult i32 0, %N
+ br i1 %cmp3, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.inner.preheader
+
+for.inner.preheader:
+ %i = phi i32 [ 0, %for.outer.preheader ], [ %inc2, %for.outer ]
+ br label %for.inner
+
+for.inner:
+ %j = phi i32 [ 0, %for.inner.preheader ], [ %inc1, %for.inner ]
+ %mul = mul i32 %i, %N
+ %gep = getelementptr i32, ptr %A, i32 %mul
+ %arrayidx = getelementptr i32, ptr %gep, i32 %j
+ store i32 0, ptr %arrayidx, align 4
+ %inc1 = add nuw i32 %j, 1
+ %cmp2 = icmp ult i32 %inc1, %N
+ br i1 %cmp2, label %for.inner, label %for.outer
+
+for.outer:
+ %inc2 = add i32 %i, 1
+ %cmp1 = icmp ult i32 %inc2, %N
+ br i1 %cmp1, label %for.inner.preheader, label %for.end.loopexit
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; We shouldn't version the loop here as the multiply would use an illegal type.
+define void @noinbounds_gep_too_large_mul(i64 %N, ptr %A) {
+; CHECK-LABEL: define void @noinbounds_gep_too_large_mul(
+; CHECK-SAME: i64 [[N:%.*]], ptr [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 0, [[N]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_OUTER_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.outer.preheader:
+; CHECK-NEXT: br label [[FOR_INNER_PREHEADER:%.*]]
+; CHECK: for.inner.preheader:
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[FOR_OUTER_PREHEADER]] ], [ [[INC2:%.*]], [[FOR_OUTER:%.*]] ]
+; CHECK-NEXT: br label [[FOR_INNER:%.*]]
+; CHECK: for.inner:
+; CHECK-NEXT: [[J:%.*]] = phi i64 [ 0, [[FOR_INNER_PREHEADER]] ], [ [[INC1:%.*]], [[FOR_INNER]] ]
+; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[I]], [[N]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[A]], i64 [[MUL]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i32, ptr [[GEP]], i64 [[J]]
+; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[INC1]] = add nuw i64 [[J]], 1
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[INC1]], [[N]]
+; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_INNER]], label [[FOR_OUTER]]
+; CHECK: for.outer:
+; CHECK-NEXT: [[INC2]] = add i64 [[I]], 1
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[INC2]], [[N]]
+; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_INNER_PREHEADER]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp3 = icmp ult i64 0, %N
+ br i1 %cmp3, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+ br label %for.inner.preheader
+
+for.inner.preheader:
+ %i = phi i64 [ 0, %for.outer.preheader ], [ %inc2, %for.outer ]
+ br label %for.inner
+
+for.inner:
+ %j = phi i64 [ 0, %for.inner.preheader ], [ %inc1, %for.inner ]
+ %mul = mul i64 %i, %N
+ %gep = getelementptr i32, ptr %A, i64 %mul
+ %arrayidx = getelementptr i32, ptr %gep, i64 %j
+ store i32 0, ptr %arrayidx, align 4
+ %inc1 = add nuw i64 %j, 1
+ %cmp2 = icmp ult i64 %inc1, %N
+ br i1 %cmp2, label %for.inner, label %for.outer
+
+for.outer:
+ %inc2 = add i64 %i, 1
+ %cmp1 = icmp ult i64 %inc2, %N
+ br i1 %cmp1, label %for.inner.preheader, label %for.end.loopexit
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; A 3d loop corresponding to:
+;
+; for (int k = 0; k < N; ++k)
+; for (int i = 0; i < N; ++i)
+; for (int j = 0; j < M; ++j)
+; f(&A[i*M+j]);
+;
+define void @d3_2(i32* %A, i32 %N, i32 %M) {
+; CHECK-LABEL: define void @d3_2(
+; CHECK-SAME: ptr [[A:%.*]], i32 [[N:%.*]], i32 [[M:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP30:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP30]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.cond1.preheader.lr.ph:
+; CHECK-NEXT: [[CMP625:%.*]] = icmp sgt i32 [[M]], 0
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]]
+; CHECK: for.cond1.preheader.us:
+; CHECK-NEXT: [[K_031_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_LR_PH]] ], [ [[INC13_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ]
+; CHECK-NEXT: br i1 [[CMP625]], label [[FOR_COND5_PREHEADER_US_US_LVER_CHECK:%.*]], label [[FOR_COND5_PREHEADER_US43_PREHEADER:%.*]]
+; CHECK: for.cond5.preheader.us43.preheader:
+; CHECK-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_LOOPEXIT50:%.*]]
+; CHECK: for.cond5.prehead...
[truncated]
``````````
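The new code path is on by default (`loop-flatten-version-loops` is `cl::init(true)`). A hedged example of turning it off when invoking opt directly, assuming the usual boolean cl::opt `=false` syntax (the input file name is a placeholder):

```
opt -S -passes='loop(loop-flatten)' -loop-flatten-version-loops=false input.ll -o out.ll
```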
https://github.com/llvm/llvm-project/pull/78576