[llvm] [LoopInterchange] Support inner-loop simple reductions via UndoSimpleReduction (PR #172970)
Yingying Wang via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 25 02:07:08 PST 2025
https://github.com/buggfg updated https://github.com/llvm/llvm-project/pull/172970
>From 9e354c81a94d2dcaf6c1603dcf31ef0fd453df79 Mon Sep 17 00:00:00 2001
From: buggfg <3171290993 at qq.com>
Date: Fri, 19 Dec 2025 16:11:26 +0800
Subject: [PATCH 01/10] Support inner-loop simple reductions via
UndoSimpleReduction
Co-Authored-By: ict-ql <168183727+ict-ql at users.noreply.github.com>
Co-Authored-By: Lin Wang <wanglulin at ict.ac.cn>
---
.../lib/Transforms/Scalar/LoopInterchange.cpp | 411 ++++++++++++++----
.../LoopInterchange/simple-reduction.ll | 86 ++++
2 files changed, 414 insertions(+), 83 deletions(-)
create mode 100644 llvm/test/Transforms/LoopInterchange/simple-reduction.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 330b4abb9942f..3da23c7f9ae11 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -31,6 +31,7 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
@@ -120,6 +121,12 @@ static cl::list<RuleTy> Profitabilities(
"Ignore profitability, force interchange (does not "
"work with other options)")));
+// Support for simple reduction of inner loop.
+static cl::opt<bool>
+ EnableUndoSimpleReduction("undo-simple-reduction", cl::init(false),
+ cl::Hidden,
+ cl::desc("Support for simple reduction of inner loop."));
+
#ifndef NDEBUG
static bool noDuplicateRulesAndIgnore(ArrayRef<RuleTy> Rules) {
SmallSet<RuleTy, 4> Set;
@@ -446,8 +453,8 @@ namespace {
class LoopInterchangeLegality {
public:
LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
- OptimizationRemarkEmitter *ORE)
- : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
+ OptimizationRemarkEmitter *ORE, DominatorTree *DT)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), DT(DT), ORE(ORE) {}
/// Check if the loops can be interchanged.
bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
@@ -475,9 +482,30 @@ class LoopInterchangeLegality {
return HasNoWrapReductions;
}
+ // Describes one simple reduction recorded for the inner loop.
+ struct SimpleReduction {
+ // The reduction PHI itself.
+ PHINode *Re;
+ // Init is Re's preheader value (constant only so far); Next its latch value.
+ Value *Init;
+ Value *Next;
+ // The LCSSA PHI of Next, located in the inner loop's exit block.
+ PHINode *LcssaPhi;
+ // The single user of LcssaPhi (only one user is supported for now):
+ // the store that writes the reduction result into a memory object.
+ StoreInst *LcssaStorer;
+ // The memory Location
+ Value *MemRef;
+ Type *ElemTy;
+ };
+
+ const ArrayRef<SimpleReduction *> getInnerSimpleReductions() const {
+ return InnerSimpleReductions;
+ }
+
private:
bool tightlyNested(Loop *Outer, Loop *Inner);
- bool containsUnsafeInstructions(BasicBlock *BB);
+ bool containsUnsafeInstructions(BasicBlock *BB, Instruction *Skip);
/// Discover induction and reduction PHIs in the header of \p L. Induction
/// PHIs are added to \p Inductions, reductions are added to
@@ -487,11 +515,16 @@ class LoopInterchangeLegality {
SmallVector<PHINode *, 8> &Inductions,
Loop *InnerLoop);
+ /// Detect simple-reduction PHIs in the inner loop. Add them to
+ /// InnerSimpleReductions.
+ bool findSimpleReduction(Loop *L, PHINode *Phi,
+ SmallVectorImpl<Instruction *> &HasNoWrapInsts);
+
Loop *OuterLoop;
Loop *InnerLoop;
ScalarEvolution *SE;
-
+ DominatorTree *DT;
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
@@ -506,6 +539,9 @@ class LoopInterchangeLegality {
/// like integer addition/multiplication. Those flags must be dropped when
/// interchanging the loops.
SmallVector<Instruction *, 4> HasNoWrapReductions;
+
+ /// Vector of simple reductions of inner loop.
+ SmallVector<SimpleReduction *, 8> InnerSimpleReductions;
};
/// Manages information utilized by the profitability check for cache. The main
@@ -575,6 +611,7 @@ class LoopInterchangeTransform {
/// Interchange OuterLoop and InnerLoop.
bool transform(ArrayRef<Instruction *> DropNoWrapInsts);
+ void undoSimpleReduction();
void restructureLoops(Loop *NewInner, Loop *NewOuter,
BasicBlock *OrigInnerPreHeader,
BasicBlock *OrigOuterPreHeader);
@@ -693,7 +730,7 @@ struct LoopInterchange {
Loop *InnerLoop = LoopList[InnerLoopId];
LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
<< " and OuterLoopId = " << OuterLoopId << "\n");
- LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
+ LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE, DT);
if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
return false;
@@ -734,8 +771,11 @@ struct LoopInterchange {
} // end anonymous namespace
-bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
- return any_of(*BB, [](const Instruction &I) {
+bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB,
+ Instruction *Skip) {
+ return any_of(*BB, [Skip](const Instruction &I) {
+ if (&I == Skip)
+ return false;
return I.mayHaveSideEffects() || I.mayReadFromMemory();
});
}
@@ -761,17 +801,27 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
return false;
LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
+
+ // The inner loop simple-reduction pattern requires storing the LCSSA PHI in
+ // the OuterLoop Latch. Therefore, when UndoSimpleReduction is enabled, skip
+ // that store during checks.
+ Instruction *Skip = nullptr;
+ if (EnableUndoSimpleReduction) {
+ if (InnerSimpleReductions.size() == 1)
+ Skip = InnerSimpleReductions[0]->LcssaStorer;
+ }
+
// We do not have any basic block in between now make sure the outer header
// and outer loop latch doesn't contain any unsafe instructions.
- if (containsUnsafeInstructions(OuterLoopHeader) ||
- containsUnsafeInstructions(OuterLoopLatch))
+ if (containsUnsafeInstructions(OuterLoopHeader, Skip) ||
+ containsUnsafeInstructions(OuterLoopLatch, Skip))
return false;
// Also make sure the inner loop preheader does not contain any unsafe
// instructions. Note that all instructions in the preheader will be moved to
// the outer loop header when interchanging.
if (InnerLoopPreHeader != OuterLoopHeader &&
- containsUnsafeInstructions(InnerLoopPreHeader))
+ containsUnsafeInstructions(InnerLoopPreHeader, Skip))
return false;
BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
@@ -787,7 +837,7 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
// The inner loop exit block does flow to the outer loop latch and not some
// other BBs, now make sure it contains safe instructions, since it will be
// moved into the (new) inner loop after interchange.
- if (containsUnsafeInstructions(InnerLoopExit))
+ if (containsUnsafeInstructions(InnerLoopExit, Skip))
return false;
LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
@@ -898,6 +948,77 @@ static Value *followLCSSA(Value *SV) {
return followLCSSA(PHI->getIncomingValue(0));
}
+bool CheckReductionKind(Loop *L, PHINode *PHI,
+ SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
+ RecurrenceDescriptor RD;
+ if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) {
+ // Reject floating-point reductions that cannot be reordered.
+ if (RD.getExactFPMathInst() != nullptr)
+ return false;
+
+ RecurKind RK = RD.getRecurrenceKind();
+ switch (RK) {
+ case RecurKind::Or:
+ case RecurKind::And:
+ case RecurKind::Xor:
+ case RecurKind::SMin:
+ case RecurKind::SMax:
+ case RecurKind::UMin:
+ case RecurKind::UMax:
+ case RecurKind::FAdd:
+ case RecurKind::FMul:
+ case RecurKind::FMin:
+ case RecurKind::FMax:
+ case RecurKind::FMinimum:
+ case RecurKind::FMaximum:
+ case RecurKind::FMinimumNum:
+ case RecurKind::FMaximumNum:
+ case RecurKind::FMulAdd:
+ case RecurKind::AnyOf:
+ return true;
+
+ // Changing the order of integer addition/multiplication may change the
+ // semantics. Consider the following case:
+ //
+ // int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
+ // int sum = 0;
+ // for (int i = 0; i < 2; i++)
+ // for (int j = 0; j < 2; j++)
+ // sum += A[j][i];
+ //
+ // If the above loops are exchanged, the addition will cause an
+ // overflow. To prevent this, we must drop the nuw/nsw flags from the
+ // addition/multiplication instructions when we actually exchange the
+ // loops.
+ case RecurKind::Add:
+ case RecurKind::Mul: {
+ unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
+ SmallVector<Instruction *, 4> Ops = RD.getReductionOpChain(PHI, L);
+
+ // Bail out when we fail to collect the reduction instruction chain.
+ if (Ops.empty())
+ return false;
+
+ for (Instruction *I : Ops) {
+ assert(I->getOpcode() == OpCode &&
+ "Expected the instruction to be the reduction operation");
+ (void)OpCode;
+
+ // If the instruction has nuw/nsw flags, record it in HasNoWrapInsts so
+ // the flags can be dropped when the transformation is performed.
+ if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
+ HasNoWrapInsts.push_back(I);
+ }
+ return true;
+ }
+
+ default:
+ return false;
+ }
+ } else
+ return false;
+}
+
// Check V's users to see if it is involved in a reduction in L.
static PHINode *
findInnerReductionPhi(Loop *L, Value *V,
@@ -910,72 +1031,12 @@ findInnerReductionPhi(Loop *L, Value *V,
if (PHINode *PHI = dyn_cast<PHINode>(User)) {
if (PHI->getNumIncomingValues() == 1)
continue;
- RecurrenceDescriptor RD;
- if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) {
- // Detect floating point reduction only when it can be reordered.
- if (RD.getExactFPMathInst() != nullptr)
- return nullptr;
-
- RecurKind RK = RD.getRecurrenceKind();
- switch (RK) {
- case RecurKind::Or:
- case RecurKind::And:
- case RecurKind::Xor:
- case RecurKind::SMin:
- case RecurKind::SMax:
- case RecurKind::UMin:
- case RecurKind::UMax:
- case RecurKind::FAdd:
- case RecurKind::FMul:
- case RecurKind::FMin:
- case RecurKind::FMax:
- case RecurKind::FMinimum:
- case RecurKind::FMaximum:
- case RecurKind::FMinimumNum:
- case RecurKind::FMaximumNum:
- case RecurKind::FMulAdd:
- case RecurKind::AnyOf:
- return PHI;
-
- // Change the order of integer addition/multiplication may change the
- // semantics. Consider the following case:
- //
- // int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
- // int sum = 0;
- // for (int i = 0; i < 2; i++)
- // for (int j = 0; j < 2; j++)
- // sum += A[j][i];
- //
- // If the above loops are exchanged, the addition will cause an
- // overflow. To prevent this, we must drop the nuw/nsw flags from the
- // addition/multiplication instructions when we actually exchanges the
- // loops.
- case RecurKind::Add:
- case RecurKind::Mul: {
- unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
- SmallVector<Instruction *, 4> Ops = RD.getReductionOpChain(PHI, L);
-
- // Bail out when we fail to collect reduction instructions chain.
- if (Ops.empty())
- return nullptr;
-
- for (Instruction *I : Ops) {
- assert(I->getOpcode() == OpCode &&
- "Expected the instruction to be the reduction operation");
- (void)OpCode;
-
- // If the instruction has nuw/nsw flags, we must drop them when the
- // transformation is actually performed.
- if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
- HasNoWrapInsts.push_back(I);
- }
- return PHI;
- }
- default:
- return nullptr;
- }
- }
+ if (CheckReductionKind(L, PHI, HasNoWrapInsts))
+ return PHI;
+ else
+ return nullptr;
+
return nullptr;
}
}
@@ -983,6 +1044,116 @@ findInnerReductionPhi(Loop *L, Value *V,
return nullptr;
}
+// Detect and record the simple reduction of the inner loop.
+//
+// innerloop:
+// Re = phi<0.0, Next>
+// ReUser = Re op ...
+// ...
+// Next = ReUser op ...
+// OuterLoopLatch:
+// Lcssa = phi<Next> ; lcssa phi
+// store Lcssa, MemRef ; LcssaStorer
+//
+bool LoopInterchangeLegality::findSimpleReduction(
+    Loop *L, PHINode *Phi, SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
+
+  // Returns true, and records the reduction in InnerSimpleReductions, when
+  // Phi matches the simple-reduction pattern; returns false otherwise.
+  //
+  // Undoing a simple reduction is only supported when L is the innermost
+  // loop of the pair being interchanged.
+  if (!L->isInnermost())
+    return false;
+
+  if (Phi->getNumIncomingValues() != 2)
+    return false;
+
+  // Both blocks are needed to look up the incoming values below;
+  // getIncomingValueForBlock() asserts on a block that is not a predecessor.
+  BasicBlock *Preheader = L->getLoopPreheader();
+  BasicBlock *Latch = L->getLoopLatch();
+  if (!Preheader || !Latch)
+    return false;
+
+  Value *Init = Phi->getIncomingValueForBlock(Preheader);
+  Value *Next = Phi->getIncomingValueForBlock(Latch);
+
+  // So far only a constant initial value is supported.
+  if (!isa<Constant>(Init))
+    return false;
+
+  // The reduction result must be an instruction that lives in the inner
+  // loop. A non-instruction value is rejected as well: undoSimpleReduction()
+  // moves the store right after this instruction.
+  auto *NextI = dyn_cast<Instruction>(Next);
+  if (!NextI || !L->contains(NextI->getParent()))
+    return false;
+
+  // The reduction should have only one user. user_back() is used instead of
+  // getUniqueUndroppableUser() because the latter may return null (when the
+  // sole user is droppable), and dyn_cast must not be given a null pointer.
+  if (!Phi->hasOneUser())
+    return false;
+  auto *ReUser = dyn_cast<Instruction>(Phi->user_back());
+  if (!ReUser || !L->contains(ReUser->getParent()))
+    return false;
+
+  // Check the reduction operation: it must be an associative binary
+  // operator. This already excludes sub/fsub, which are never associative.
+  if (!ReUser->isBinaryOp() || !ReUser->isAssociative())
+    return false;
+
+  // Check the reduction kind.
+  if (ReUser != Next && !CheckReductionKind(L, Phi, HasNoWrapInsts))
+    return false;
+
+  // Find the LCSSA PHI in the block the inner loop exits to.
+  BasicBlock *ExitingBlock = L->getExitingBlock();
+  if (!ExitingBlock)
+    return false;
+  auto *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+  if (!BI)
+    return false;
+  BasicBlock *ExitBlock =
+      BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0);
+
+  PHINode *Lcssa = nullptr;
+  for (User *U : Next->users()) {
+    // Every user of Next must be a PHI node.
+    auto *P = dyn_cast<PHINode>(U);
+    if (!P)
+      return false;
+    if (P == Phi)
+      continue;
+
+    // Query the incoming value through the exiting block: that is the block
+    // that is a predecessor of ExitBlock, and it need not be the latch.
+    if (!Lcssa && P->getParent() == ExitBlock &&
+        P->getIncomingValueForBlock(ExitingBlock) == Next)
+      Lcssa = P;
+  }
+  if (!Lcssa || !Lcssa->hasOneUser())
+    return false;
+
+  // The single user of the LCSSA PHI must be a plain store. A volatile or
+  // atomic store must not be moved into the loop, where it would execute on
+  // every iteration and be paired with a newly inserted load.
+  auto *LcssaStorer = dyn_cast<StoreInst>(Lcssa->user_back());
+  if (!LcssaStorer || !LcssaStorer->isSimple())
+    return false;
+
+  Value *MemRef = LcssaStorer->getPointerOperand();
+  Type *ElemTy = LcssaStorer->getValueOperand()->getType();
+
+  // undoSimpleReduction() moves LcssaStorer into the inner loop, so the
+  // address must be available there. Arguments, globals and constants always
+  // are; an address computed by an instruction must dominate the exit block,
+  // otherwise the move is unsafe.
+  if (auto *MemRefI = dyn_cast<Instruction>(MemRef))
+    if (!DT->dominates(MemRefI, ExitBlock))
+      return false;
+
+  // Found a simple reduction of inner loop.
+  // NOTE(review): SR is heap-allocated and nothing visible here frees it;
+  // InnerSimpleReductions holds raw pointers. Storing SimpleReduction by
+  // value in that container would remove the leak.
+  SimpleReduction *SR = new SimpleReduction;
+  SR->Re = Phi;
+  SR->Init = Init;
+  SR->Next = Next;
+  SR->LcssaPhi = Lcssa;
+  SR->LcssaStorer = LcssaStorer;
+  SR->MemRef = MemRef;
+  SR->ElemTy = ElemTy;
+
+  InnerSimpleReductions.push_back(SR);
+  return true;
+}
+
bool LoopInterchangeLegality::findInductionAndReductions(
Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
if (!L->getLoopLatch() || !L->getLoopPredecessor())
@@ -995,11 +1166,14 @@ bool LoopInterchangeLegality::findInductionAndReductions(
// PHIs in inner loops need to be part of a reduction in the outer loop,
// discovered when checking the PHIs of the outer loop earlier.
if (!InnerLoop) {
- if (!OuterInnerReductions.count(&PHI)) {
- LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
- "across the outer loop.\n");
+ if (OuterInnerReductions.count(&PHI)) {
+ LLVM_DEBUG(dbgs() << "Found a reduction across the outer loop.\n");
+ } else if (EnableUndoSimpleReduction &&
+ findSimpleReduction(L, &PHI, HasNoWrapReductions)) {
+ LLVM_DEBUG(dbgs() << "Found a simple reduction in the inner loop: \n"
+ << PHI << '\n');
+ } else
return false;
- }
} else {
assert(PHI.getNumIncomingValues() == 2 &&
"Phis in loop header should have exactly 2 incoming values");
@@ -1020,6 +1194,10 @@ bool LoopInterchangeLegality::findInductionAndReductions(
}
}
}
+
+ // For now we only support at most one reduction.
+ if (InnerSimpleReductions.size() > 1)
+ return false;
return true;
}
@@ -1115,12 +1293,15 @@ bool LoopInterchangeLegality::findInductions(
// the we are only interested in the final value after the loop).
static bool
areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
- SmallPtrSetImpl<PHINode *> &Reductions) {
+ SmallPtrSetImpl<PHINode *> &Reductions,
+ PHINode *LcssaSimpleRed) {
BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
for (PHINode &PHI : InnerExit->phis()) {
// Reduction lcssa phi will have only 1 incoming block that from loop latch.
if (PHI.getNumIncomingValues() > 1)
return false;
+ if (&PHI == LcssaSimpleRed)
+ return true;
if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
PHINode *PN = dyn_cast<PHINode>(U);
return !PN ||
@@ -1270,8 +1451,16 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
return false;
}
- if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop,
- OuterInnerReductions)) {
+ // The LCSSA PHI for the simple reduction has passed checks before; its user
+ // is a store instruction.
+ PHINode *LcssaSimpleRed = nullptr;
+ if (EnableUndoSimpleReduction) {
+ if (InnerSimpleReductions.size() == 1)
+ LcssaSimpleRed = InnerSimpleReductions[0]->LcssaPhi;
+ }
+
+ if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, OuterInnerReductions,
+ LcssaSimpleRed)) {
LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
@@ -1633,10 +1822,66 @@ void LoopInterchangeTransform::restructureLoops(
SE->forgetLoop(NewOuter);
}
+/*
+ User can write, optimizers can generate simple reduction for inner loop. In
+ order to make interchange valid, we have to undo reduction by moving th
+ initialization and store instructions into the inner loop. So far we only
+ handle cases where the reduction variable is initialized to a constant.
+ For example, below code:
+
+ loop:
+ re = phi<0.0, next>
+ next = re op ...
+ reduc_sum = phi<next> // lcssa phi
+ MEM_REF[idx] = reduc_sum // LcssaStorer
+
+ is transformed into:
+
+ loop:
+ tmp = MEM_REF[idx];
+ new_var = !first_iteration ? tmp : 0.0;
+ next = new_var op ...
+ MEM_REF[idx] = next; // after moving
+
+ In this way the initial const is used in the first iteration of loop.
+*/
+void LoopInterchangeTransform::undoSimpleReduction() {
+
+  auto &InnerSimpleReductions = LIL.getInnerSimpleReductions();
+  assert(InnerSimpleReductions.size() == 1 &&
+         "Expected exactly one simple reduction to undo");
+  LoopInterchangeLegality::SimpleReduction *SR = InnerSimpleReductions[0];
+  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+  IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
+
+  // The reduction is initialized from a constant, so after undoing it the
+  // running value has to be re-read from the memory object in every
+  // iteration. Reuse the alignment of the store being moved: the load must
+  // not assume more alignment than that store guarantees.
+  Instruction *LoadMem = Builder.CreateAlignedLoad(
+      SR->ElemTy, SR->MemRef, SR->LcssaStorer->getAlign());
+
+  // Compare the induction variable against its initial value. The result is
+  // true on every iteration EXCEPT the first one (the IR value keeps its
+  // historical name "first.iter").
+  auto &InductionPHIs = LIL.getInnerLoopInductions();
+  assert(!InductionPHIs.empty() && "Inner loop must have an induction PHI");
+  PHINode *IV = InductionPHIs[0];
+  Value *IVInit = IV->getIncomingValueForBlock(InnerLoop->getLoopPreheader());
+  Value *NotFirstIter = Builder.CreateICmpNE(IV, IVInit, "first.iter");
+
+  // new_var is the constant on the first iteration and the value loaded from
+  // MEM_REF on all later iterations.
+  Value *NewVar =
+      Builder.CreateSelect(NotFirstIter, LoadMem, SR->Init, "new.var");
+
+  // Replace all uses of reduction var with new variable.
+  SR->Re->replaceAllUsesWith(NewVar);
+
+  // Move store instruction into inner loop, just after reduction next's def.
+  // Legality only records reductions whose Next is an instruction, so use
+  // cast<> (asserts on violation) instead of passing a possibly-null
+  // dyn_cast<> result to moveAfter().
+  SR->LcssaStorer->setOperand(0, SR->Next);
+  SR->LcssaStorer->moveAfter(cast<Instruction>(SR->Next));
+}
+
bool LoopInterchangeTransform::transform(
ArrayRef<Instruction *> DropNoWrapInsts) {
bool Transformed = false;
+ auto &InnerSimpleReductions = LIL.getInnerSimpleReductions();
+ if (EnableUndoSimpleReduction && InnerSimpleReductions.size() == 1)
+ undoSimpleReduction();
+
if (InnerLoop->getSubLoops().empty()) {
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
diff --git a/llvm/test/Transforms/LoopInterchange/simple-reduction.ll b/llvm/test/Transforms/LoopInterchange/simple-reduction.ll
new file mode 100644
index 0000000000000..9a4393f827a36
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/simple-reduction.ll
@@ -0,0 +1,86 @@
+; NOTE: Support simple reduction in the inner loop by undoing the simple reduction.
+; RUN: opt < %s -passes="loop(loop-interchange),dce" -undo-simple-reduction -loop-interchange-profitabilities=ignore -S | FileCheck %s
+
+; for (int i = 0; i < n; i++) {
+; s[i] = 0;
+; for (int j = 0; j < n; j++)
+; s[i] = s[i] + a[j][i] * b[j][i];
+; }
+
+define void @func(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, i64 noundef %n) {
+; CHECK-LABEL: define void @func(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, i64 noundef %n) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[INNERLOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK: outerloop_header.preheader:
+; CHECK-NEXT: br label [[OUTERLOOP_HEADER:%.*]]
+; CHECK: outerloop_header:
+; CHECK-NEXT: [[INDEX_I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[OUTERLOOP_LATCH:%.*]] ], [ 0, [[OUTERLOOPHEADER_PREHEADER:%.*]] ]
+; CHECK-NEXT: [[ADDR_S:%.*]] = getelementptr inbounds nuw double, ptr %s, i64 [[INDEX_I]]
+; CHECK-NEXT: [[ADDR_A:%.*]] = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 [[INDEX_I]]
+; CHECK-NEXT: [[ADDR_B:%.*]] = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 [[INDEX_I]]
+; CHECK-NEXT: br label [[INNERLOOP_SPLIT1:%.*]]
+; CHECK: innerloop.preheader:
+; CHECK-NEXT: br label [[INNERLOOP:%.*]]
+; CHECK: innerloop:
+; CHECK-NEXT: [[INDEX_J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[INNERLOOP_SPLIT:%.*]] ], [ 0, [[INNERLOOP_PREHEADER:%.*]] ]
+; CHECK-NEXT: br label [[OUTERLOOPHEADER_PREHEADER:%.*]]
+; CHECK: innerloop.split1:
+; CHECK-NEXT: [[S:%.*]] = load double, ptr [[ADDR_S]], align 8
+; CHECK-NEXT: [[FIRSTITER:%.*]] = icmp ne i64 [[INDEX_J]], 0
+; CHECK-NEXT: [[NEW_VAR:%.*]] = select i1 [[FIRSTITER]], double [[S]], double 0.000000e+00
+; CHECK-NEXT: [[ADDR_A_J_I:%.*]] = getelementptr inbounds nuw [100 x double], ptr [[ADDR_A]], i64 [[INDEX_J]]
+; CHECK-NEXT: [[A_J_I:%.*]] = load double, ptr [[ADDR_A_J_I]], align 8
+; CHECK-NEXT: [[ADDR_B_J_I:%.*]] = getelementptr inbounds nuw [100 x double], ptr [[ADDR_B]], i64 [[INDEX_J]]
+; CHECK-NEXT: [[B_J_I:%.*]] = load double, ptr [[ADDR_B_J_I]], align 8
+; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[B_J_I]], [[A_J_I]]
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[MUL]], [[NEW_VAR]]
+; CHECK-NEXT: store double [[ADD]], ptr [[ADDR_S]], align 8
+; CHECK-NEXT: br label [[OUTERLOOP_LATCH:%.*]]
+; CHECK: innerloop.split:
+; CHECK-NEXT: [[J_NEXT:%.*]] = add nuw nsw i64 [[INDEX_J]], 1
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 [[J_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP1]], label [[EXIT_LOOPEXIT:%.*]], label [[INNERLOOP]]
+; CHECK: outerloop_latch:
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[INDEX_I]], 1
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP2]], label [[INNERLOOP_SPLIT:%.*]], label [[OUTERLOOP_HEADER]]
+; CHECK: exit.loopexit:
+; CHECK-NEXT: br label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %outerloop_header, label %exit
+
+outerloop_header:
+ %index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
+ %addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
+ %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
+ %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+ br label %innerloop
+
+innerloop:
+ %index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
+ %reduction = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
+ %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+ %0 = load double, ptr %addr_a_j_i, align 8
+ %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+ %1 = load double, ptr %addr_b_j_i, align 8
+ %mul = fmul fast double %1, %0
+ %add = fadd fast double %mul, %reduction
+ %index_j.next = add nuw nsw i64 %index_j, 1
+ %cond1 = icmp eq i64 %index_j.next, %n
+ br i1 %cond1, label %outerloop_latch, label %innerloop
+
+outerloop_latch:
+ %lcssa = phi double [ %add, %innerloop ]
+ store double %lcssa, ptr %addr_s, align 8
+ %index_i.next = add nuw nsw i64 %index_i, 1
+ %cond2 = icmp eq i64 %index_i.next, %n
+ br i1 %cond2, label %exit, label %outerloop_header
+
+exit:
+ ret void
+}
>From 404657762e6c223894e5a4635302cba99514ff83 Mon Sep 17 00:00:00 2001
From: buggfg <3171290993 at qq.com>
Date: Fri, 19 Dec 2025 16:29:45 +0800
Subject: [PATCH 02/10] Correct the format.
---
llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 3da23c7f9ae11..329cb2189827e 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -122,10 +122,9 @@ static cl::list<RuleTy> Profitabilities(
"work with other options)")));
// Support for simple reduction of inner loop.
-static cl::opt<bool>
- EnableUndoSimpleReduction("undo-simple-reduction", cl::init(false),
- cl::Hidden,
- cl::desc("Support for simple reduction of inner loop."));
+static cl::opt<bool> EnableUndoSimpleReduction(
+ "undo-simple-reduction", cl::init(false), cl::Hidden,
+ cl::desc("Support for simple reduction of inner loop."));
#ifndef NDEBUG
static bool noDuplicateRulesAndIgnore(ArrayRef<RuleTy> Rules) {
>From cbeeb15da98d7a544a71450fc495191bf2813baf Mon Sep 17 00:00:00 2001
From: Yingying Wang <3171290993 at qq.com>
Date: Mon, 22 Dec 2025 18:07:58 +0800
Subject: [PATCH 03/10] Correct the format of the comments.
Co-authored-by: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
---
.../lib/Transforms/Scalar/LoopInterchange.cpp | 22 +++++++++----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 329cb2189827e..19da518caed2a 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -1043,17 +1043,17 @@ findInnerReductionPhi(Loop *L, Value *V,
return nullptr;
}
-// Detect and record the simple reduction of the inner loop.
-//
-// innerloop:
-// Re = phi<0.0, Next>
-// ReUser = Re op ...
-// ...
-// Next = ReUser op ...
-// OuterLoopLatch:
-// Lcssa = phi<Next> ; lcssa phi
-// store Lcssa, MemRef ; LcssaStorer
-//
+/// Detect and record the simple reduction of the inner loop.
+///
+/// innerloop:
+/// Re = phi<0.0, Next>
+/// ReUser = Re op ...
+/// ...
+/// Next = ReUser op ...
+/// OuterLoopLatch:
+/// Lcssa = phi<Next> ; lcssa phi
+/// store Lcssa, MemRef ; LcssaStorer
+///
bool LoopInterchangeLegality::findSimpleReduction(
Loop *L, PHINode *Phi, SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
>From 66f55c6d1dff845adbc822eff6d230b6120c6e7f Mon Sep 17 00:00:00 2001
From: Yingying Wang <3171290993 at qq.com>
Date: Mon, 22 Dec 2025 18:08:31 +0800
Subject: [PATCH 04/10] Correct the format of the comments.
Co-authored-by: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
---
.../lib/Transforms/Scalar/LoopInterchange.cpp | 44 +++++++++----------
1 file changed, 21 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 19da518caed2a..31778ea028f00 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -1821,29 +1821,27 @@ void LoopInterchangeTransform::restructureLoops(
SE->forgetLoop(NewOuter);
}
-/*
- User can write, optimizers can generate simple reduction for inner loop. In
- order to make interchange valid, we have to undo reduction by moving th
- initialization and store instructions into the inner loop. So far we only
- handle cases where the reduction variable is initialized to a constant.
- For example, below code:
-
- loop:
- re = phi<0.0, next>
- next = re op ...
- reduc_sum = phi<next> // lcssa phi
- MEM_REF[idx] = reduc_sum // LcssaStorer
-
- is transformed into:
-
- loop:
- tmp = MEM_REF[idx];
- new_var = !first_iteration ? tmp : 0.0;
- next = new_var op ...
- MEM_REF[idx] = next; // after moving
-
- In this way the initial const is used in the first iteration of loop.
-*/
+/// Users can write, and optimizers can generate, a simple reduction in the
+/// inner loop. To make interchange valid, we undo the reduction by moving the
+/// initialization and store instructions into the inner loop. So far we only
+/// handle cases where the reduction variable is initialized to a constant.
+/// For example, below code:
+///
+/// loop:
+/// re = phi<0.0, next>
+/// next = re op ...
+/// reduc_sum = phi<next> // lcssa phi
+/// MEM_REF[idx] = reduc_sum // LcssaStorer
+///
+/// is transformed into:
+///
+/// loop:
+/// tmp = MEM_REF[idx];
+/// new_var = !first_iteration ? tmp : 0.0;
+/// next = new_var op ...
+/// MEM_REF[idx] = next; // after moving
+///
+/// In this way the initial const is used in the first iteration of loop.
void LoopInterchangeTransform::undoSimpleReduction() {
auto &InnerSimpleReductions = LIL.getInnerSimpleReductions();
>From e234c58b151c7161ce4e85c5a8a2a7be21269792 Mon Sep 17 00:00:00 2001
From: buggfg <3171290993 at qq.com>
Date: Wed, 24 Dec 2025 17:45:38 +0800
Subject: [PATCH 05/10] Add four negative tests and improve security
---
.../lib/Transforms/Scalar/LoopInterchange.cpp | 142 ++++++++---
.../simple-reduction-limitation.ll | 240 ++++++++++++++++++
.../LoopInterchange/simple-reduction.ll | 8 +-
3 files changed, 351 insertions(+), 39 deletions(-)
create mode 100644 llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 31778ea028f00..7cd6744c34528 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -123,7 +123,7 @@ static cl::list<RuleTy> Profitabilities(
// Support for simple reduction of inner loop.
static cl::opt<bool> EnableUndoSimpleReduction(
- "undo-simple-reduction", cl::init(false), cl::Hidden,
+ "loop-interchange-undo-simple-reduction", cl::init(false), cl::Hidden,
cl::desc("Support for simple reduction of inner loop."));
#ifndef NDEBUG
@@ -496,9 +496,12 @@ class LoopInterchangeLegality {
// The memory Location
Value *MemRef;
Type *ElemTy;
+
+ /// IV used for the loop exit condition.
+ PHINode *CounterIV;
};
- const ArrayRef<SimpleReduction *> getInnerSimpleReductions() const {
+ const ArrayRef<SimpleReduction> getInnerSimpleReductions() const {
return InnerSimpleReductions;
}
@@ -540,7 +543,7 @@ class LoopInterchangeLegality {
SmallVector<Instruction *, 4> HasNoWrapReductions;
/// Vector of simple reductions of inner loop.
- SmallVector<SimpleReduction *, 8> InnerSimpleReductions;
+ SmallVector<SimpleReduction, 8> InnerSimpleReductions;
};
/// Manages information utilized by the profitability check for cache. The main
@@ -806,8 +809,10 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
// that store during checks.
Instruction *Skip = nullptr;
if (EnableUndoSimpleReduction) {
+ assert(InnerSimpleReductions.size() <= 1 &&
+ "So far we only support at most one reduction.");
if (InnerSimpleReductions.size() == 1)
- Skip = InnerSimpleReductions[0]->LcssaStorer;
+ Skip = InnerSimpleReductions[0].LcssaStorer;
}
// We do not have any basic block in between now make sure the outer header
@@ -947,7 +952,7 @@ static Value *followLCSSA(Value *SV) {
return followLCSSA(PHI->getIncomingValue(0));
}
-bool CheckReductionKind(Loop *L, PHINode *PHI,
+static bool CheckReductionKind(Loop *L, PHINode *PHI,
SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
RecurrenceDescriptor RD;
if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) {
@@ -1035,14 +1040,71 @@ findInnerReductionPhi(Loop *L, Value *V,
return PHI;
else
return nullptr;
+ }
+ }
return nullptr;
}
+
+static PHINode *getCounterFromInc(Value *IncV, Loop *L) {
+ Instruction *IncI = dyn_cast<Instruction>(IncV);
+ if (!IncI)
+ return nullptr;
+
+ if (IncI->getOpcode() != Instruction::Add &&
+ IncI->getOpcode() != Instruction::Sub)
+ return nullptr;
+
+ PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0));
+ if (Phi && Phi->getParent() == L->getHeader()) {
+ return Phi;
+ }
+
+ // Allow add/sub to be commuted.
+ Phi = dyn_cast<PHINode>(IncI->getOperand(1));
+ if (Phi && Phi->getParent() == L->getHeader()) {
+ return Phi;
}
return nullptr;
}
+/// UndoSimpleReduction requires the first_iteration check, so look for
+/// the IV used for the loop exit condition
+static PHINode *findCounterIV(Loop *L) {
+
+ assert(L->getLoopLatch() && "Must be in simplified form");
+
+ BranchInst *BI = cast<BranchInst>(L->getLoopLatch()->getTerminator());
+ if (L->isLoopInvariant(BI->getCondition()))
+ return nullptr;
+
+ ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!Cond)
+ return nullptr;
+
+ // Look for a loop invariant RHS
+ Value *LHS = Cond->getOperand(0);
+ Value *RHS = Cond->getOperand(1);
+ if (!L->isLoopInvariant(RHS)) {
+ if (!L->isLoopInvariant(LHS))
+ return nullptr;
+ std::swap(LHS, RHS);
+ }
+
+ // IndVar = phi[{InitialValue, preheader}, {StepInst, latch}]
+ // StepInst = IndVar + step
+ // case 1:
+ // cmp = IndVar < FinalValue
+ PHINode *Counter = dyn_cast<PHINode>(LHS);
+ // case 2:
+ // cmp = StepInst < FinalValue
+ if (!Counter)
+ Counter = getCounterFromInc(LHS, L);
+
+ return Counter;
+}
+
/// Detect and record the simple reduction of the inner loop.
///
/// innerloop:
@@ -1069,8 +1131,7 @@ bool LoopInterchangeLegality::findSimpleReduction(
Value *Next = Phi->getIncomingValueForBlock(L->getLoopLatch());
// So far only supports constant initial value.
- auto *ConstInit = dyn_cast<Constant>(Init);
- if (!ConstInit)
+ if (!isa<Constant>(Init))
return false;
// The reduction result must live in the inner loop.
@@ -1088,11 +1149,7 @@ bool LoopInterchangeLegality::findSimpleReduction(
return false;
// Check the reduction operation.
- if (!ReUser->isAssociative() || !ReUser->isBinaryOp() ||
- (ReUser->getOpcode() == Instruction::Sub &&
- ReUser->getOperand(0) == Phi) ||
- (ReUser->getOpcode() == Instruction::FSub &&
- ReUser->getOperand(0) == Phi))
+ if (!ReUser->isAssociative())
return false;
// Check the reduction kind.
@@ -1100,13 +1157,7 @@ bool LoopInterchangeLegality::findSimpleReduction(
return false;
// Find lcssa_phi in OuterLoop's Latch
- if (!L->getExitingBlock())
- return false;
- BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator());
- if (!BI)
- return false;
- BasicBlock *ExitBlock =
- BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0);
+ BasicBlock *ExitBlock = L->getExitBlock();;
if (!ExitBlock)
return false;
@@ -1119,6 +1170,8 @@ bool LoopInterchangeLegality::findSimpleReduction(
if (Lcssa == NULL && P->getParent() == ExitBlock &&
P->getIncomingValueForBlock(L->getLoopLatch()) == Next)
Lcssa = P;
+ else
+ return false;
} else
return false;
}
@@ -1139,17 +1192,23 @@ bool LoopInterchangeLegality::findSimpleReduction(
if (!DT->dominates(dyn_cast<Instruction>(MemRef), ExitBlock))
return false;
+ // find the IV used for the loop exit condition.
+ PHINode *CounterIV = findCounterIV(L);
+ if (!CounterIV)
+ return false;
+
// Found a simple reduction of inner loop.
- SimpleReduction *SR = new SimpleReduction;
- SR->Re = Phi;
- SR->Init = Init;
- SR->Next = Next;
- SR->LcssaPhi = Lcssa;
- SR->LcssaStorer = LcssaStorer;
- SR->MemRef = MemRef;
- SR->ElemTy = ElemTy;
-
- InnerSimpleReductions.push_back(&*SR);
+ SimpleReduction SR;
+ SR.Re = Phi;
+ SR.Init = Init;
+ SR.Next = Next;
+ SR.LcssaPhi = Lcssa;
+ SR.LcssaStorer = LcssaStorer;
+ SR.MemRef = MemRef;
+ SR.ElemTy = ElemTy;
+ SR.CounterIV = CounterIV;
+
+ InnerSimpleReductions.push_back(SR);
return true;
}
@@ -1454,8 +1513,10 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
// is a store instruction.
PHINode *LcssaSimpleRed = nullptr;
if (EnableUndoSimpleReduction) {
+ assert(InnerSimpleReductions.size() <= 1 &&
+ "So far we only support at most one reduction.");
if (InnerSimpleReductions.size() == 1)
- LcssaSimpleRed = InnerSimpleReductions[0]->LcssaPhi;
+ LcssaSimpleRed = InnerSimpleReductions[0].LcssaPhi;
}
if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, OuterInnerReductions,
@@ -1844,31 +1905,36 @@ void LoopInterchangeTransform::restructureLoops(
/// In this way the initial const is used in the first iteration of loop.
void LoopInterchangeTransform::undoSimpleReduction() {
- auto &InnerSimpleReductions = LIL.getInnerSimpleReductions();
- LoopInterchangeLegality::SimpleReduction *SR = InnerSimpleReductions[0];
+ ArrayRef<LoopInterchangeLegality::SimpleReduction> InnerSimpleReductions =
+ LIL.getInnerSimpleReductions();
+
+ assert(InnerSimpleReductions.size() == 1 &&
+ "So far we only support at most one reduction.");
+
+ LoopInterchangeLegality::SimpleReduction SR = InnerSimpleReductions[0];
BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
// When the reduction is intialized from constant value, we need to add
// a stmt loading from the memory object to target basic block in inner
// loop during undoing the reduction.
- Instruction *LoadMem = Builder.CreateLoad(SR->ElemTy, SR->MemRef);
+ Instruction *LoadMem = Builder.CreateLoad(SR.ElemTy, SR.MemRef);
// Check if it's the first iteration.
auto &InductionPHIs = LIL.getInnerLoopInductions();
- PHINode *IV = InductionPHIs[0];
+ PHINode *IV = SR.CounterIV;
Value *IVInit = IV->getIncomingValueForBlock(InnerLoop->getLoopPreheader());
Value *FirstIter = Builder.CreateICmpNE(IV, IVInit, "first.iter");
// Init new_var to MEM_REF or CONST depending on if it is the first iteration.
- Value *NewVar = Builder.CreateSelect(FirstIter, LoadMem, SR->Init, "new.var");
+ Value *NewVar = Builder.CreateSelect(FirstIter, LoadMem, SR.Init, "new.var");
// Replace all uses of reduction var with new variable.
- SR->Re->replaceAllUsesWith(NewVar);
+ SR.Re->replaceAllUsesWith(NewVar);
// Move store instruction into inner loop, just after reduction next's def.
- SR->LcssaStorer->setOperand(0, SR->Next);
- SR->LcssaStorer->moveAfter(dyn_cast<Instruction>(SR->Next));
+ SR.LcssaStorer->setOperand(0, SR.Next);
+ SR.LcssaStorer->moveAfter(dyn_cast<Instruction>(SR.Next));
}
bool LoopInterchangeTransform::transform(
diff --git a/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll b/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll
new file mode 100644
index 0000000000000..f4a2266ef9ffe
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll
@@ -0,0 +1,240 @@
+; Several cases of undoing simple reductions that have not yet been supported.
+; RUN: opt < %s -passes="loop-interchange" -loop-interchange-undo-simple-reduction -pass-remarks-missed='loop-interchange' \
+; RUN: -pass-remarks-output=%t -S | FileCheck -check-prefix=IR %s
+; RUN: FileCheck --input-file=%t %s
+
+
+; 1. The initial value of the reduction is not a constant.
+; for (int i = 0; i < n; i++) {
+; for (int j = 0; j < n; j++)
+; s[i] = s[i] + a[j][i] * b[j][i];
+; }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: UnsupportedPHIInner
+; CHECK-NEXT: Function: simple_reduction_01
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Only inner loops with induction or reduction PHI nodes can be interchange currently.
+
+; IR-LABEL: @simple_reduction_01(
+; IR-NOT: split
+define void @simple_reduction_01(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, i64 noundef %n) {
+entry:
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %outerloop_header, label %exit
+
+outerloop_header:
+ %index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
+ %addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
+ %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
+ %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+ %s_init = load double, ptr %addr_s, align 8
+ br label %innerloop
+
+innerloop:
+ %index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
+ %reduction = phi double [ %s_init, %outerloop_header ], [ %add, %innerloop ]
+ %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+ %0 = load double, ptr %addr_a_j_i, align 8
+ %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+ %1 = load double, ptr %addr_b_j_i, align 8
+ %mul = fmul fast double %1, %0
+ %add = fadd fast double %mul, %reduction
+ %index_j.next = add nuw nsw i64 %index_j, 1
+ %cond1 = icmp eq i64 %index_j.next, %n
+ br i1 %cond1, label %outerloop_latch, label %innerloop
+
+outerloop_latch:
+ %lcssa = phi double [ %add, %innerloop ]
+ store double %lcssa, ptr %addr_s, align 8
+ %index_i.next = add nuw nsw i64 %index_i, 1
+ %cond2 = icmp eq i64 %index_i.next, %n
+ br i1 %cond2, label %exit, label %outerloop_header
+
+exit:
+ ret void
+}
+
+; 2. There are two or more reductions
+; for (int i = 0; i < n; i++) {
+; s[i] = 0;
+; s2[i] = 0;
+; for (int j = 0; j < n; j++){
+; s[i] = s[i] + a[j][i] * b[j][i];
+; s2[i] = s2[i] + a[j][i];
+; }
+; }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: UnsupportedPHIInner
+; CHECK-NEXT: Function: simple_reduction_02
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Only inner loops with induction or reduction PHI nodes can be interchange currently.
+
+; IR-LABEL: @simple_reduction_02(
+; IR-NOT: split
+define void @simple_reduction_02(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, ptr noalias noundef writeonly captures(none) %s2, i64 noundef %n) {
+entry:
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %outerloop_header, label %exit
+
+outerloop_header:
+ %index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
+ %addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
+ %addr_s2 = getelementptr inbounds nuw double, ptr %s2, i64 %index_i
+ %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
+ %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+ br label %innerloop
+
+innerloop:
+ %index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
+ %reduction = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
+ %reduction2 = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
+ %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+ %0 = load double, ptr %addr_a_j_i, align 8
+ %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+ %1 = load double, ptr %addr_b_j_i, align 8
+ %mul = fmul fast double %1, %0
+ %add = fadd fast double %mul, %reduction
+ %add2 = fadd fast double %reduction2, %0
+ %index_j.next = add nuw nsw i64 %index_j, 1
+ %cond1 = icmp eq i64 %index_j.next, %n
+ br i1 %cond1, label %outerloop_latch, label %innerloop
+
+outerloop_latch:
+ %lcssa = phi double [ %add, %innerloop ]
+ %lcssa2 = phi double [%add2, %innerloop]
+ store double %lcssa, ptr %addr_s, align 8
+ store double %lcssa2, ptr %addr_s2, align 8
+ %index_i.next = add nuw nsw i64 %index_i, 1
+ %cond2 = icmp eq i64 %index_i.next, %n
+ br i1 %cond2, label %exit, label %outerloop_header
+
+exit:
+ ret void
+}
+
+; 3. The reduction is used more than twice in the outer loop.
+; for (int i = 0; i < n; i++) {
+; s[i] = 0;
+; for (int j = 0; j < n; j++)
+; s[i] = s[i] + a[j][i] * b[j][i];
+; s[i] += 1;
+; }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: UnsupportedPHIInner
+; CHECK-NEXT: Function: simple_reduction_03
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Only inner loops with induction or reduction PHI nodes can be interchange currently.
+
+; IR-LABEL: @simple_reduction_03(
+; IR-NOT: split
+define void @simple_reduction_03(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, i64 noundef %n) {
+entry:
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %outerloop_header, label %exit
+
+outerloop_header:
+ %index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
+ %addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
+ %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
+ %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+ br label %innerloop
+
+innerloop:
+ %index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
+ %reduction = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
+ %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+ %0 = load double, ptr %addr_a_j_i, align 8
+ %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+ %1 = load double, ptr %addr_b_j_i, align 8
+ %mul = fmul fast double %1, %0
+ %add = fadd fast double %mul, %reduction
+ %index_j.next = add nuw nsw i64 %index_j, 1
+ %cond1 = icmp eq i64 %index_j.next, %n
+ br i1 %cond1, label %outerloop_latch, label %innerloop
+
+outerloop_latch:
+ %lcssa = phi double [ %add, %innerloop ]
+ store double %lcssa, ptr %addr_s, align 8
+ %add17.us = fadd fast double %lcssa, 1.000000e+00
+ store double %add17.us, ptr %addr_s, align 8
+ %index_i.next = add nuw nsw i64 %index_i, 1
+ %cond2 = icmp eq i64 %index_i.next, %n
+ br i1 %cond2, label %exit, label %outerloop_header
+
+exit:
+ ret void
+}
+
+
+; 4. The reduction is not in the innermost loop.
+; for (int i = 0; i < n; i++) {
+; s[i] = 0;
+; for (int j = 0; j < n; j++) {
+; s[i] = s[i] + a[j][i] * b[j][i]; // reduction
+; for (int k = 0; k < n; k++)
+; c[k] = 1;
+
+; }
+; }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: UnsupportedPHIOuter
+; CHECK-NEXT: Function: simple_reduction_04
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Only outer loops with induction or reduction PHI nodes can be interchanged currently.
+
+; IR-LABEL: @simple_reduction_04(
+; IR-NOT: split
+define void @simple_reduction_04(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %c, ptr noalias noundef writeonly captures(none) %s, i64 noundef %n) {
+entry:
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %i_loop_header, label %exit
+
+i_loop_header:
+ %index_i = phi i64 [ 0, %entry ], [ %index_i.next, %i_loop_latch ]
+ %addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
+ %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
+ %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+ br label %j_loop
+
+j_loop:
+ %index_j = phi i64 [ 0, %i_loop_header ], [ %index_j.next, %j_loop_latch ]
+ %reduction = phi double [ 0.000000e+00, %i_loop_header ], [ %add, %j_loop_latch ]
+ %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+ %0 = load double, ptr %addr_a_j_i, align 8
+ %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+ %1 = load double, ptr %addr_b_j_i, align 8
+ %mul = fmul fast double %1, %0
+ %add = fadd fast double %mul, %reduction
+ br label %k_loop
+
+k_loop:
+ %index_k = phi i64 [ %index_k.next, %k_loop ], [ 0, %j_loop ]
+ %arrayidx22.us.us = getelementptr inbounds nuw double, ptr %c, i64 %index_k
+ ; store double 1.000000e+00, ptr %arrayidx22.us.us, align 8 // Avoid unrelated store instructions from affecting the interchange of the i-loop and j-loop
+ %index_k.next = add nuw nsw i64 %index_k, 1
+ %exitcond.not = icmp eq i64 %index_k.next, %n
+ br i1 %exitcond.not, label %j_loop_latch, label %k_loop
+
+j_loop_latch:
+ %index_j.next = add nuw nsw i64 %index_j, 1
+ %cond1 = icmp eq i64 %index_j.next, %n
+ br i1 %cond1, label %i_loop_latch, label %j_loop
+
+i_loop_latch:
+ %lcssa = phi double [ %add, %j_loop_latch ]
+ store double %lcssa, ptr %addr_s, align 8
+ %index_i.next = add nuw nsw i64 %index_i, 1
+ %cond2 = icmp eq i64 %index_i.next, %n
+ br i1 %cond2, label %exit, label %i_loop_header
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopInterchange/simple-reduction.ll b/llvm/test/Transforms/LoopInterchange/simple-reduction.ll
index 9a4393f827a36..d16f07b6b084e 100644
--- a/llvm/test/Transforms/LoopInterchange/simple-reduction.ll
+++ b/llvm/test/Transforms/LoopInterchange/simple-reduction.ll
@@ -1,5 +1,6 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; NOTE: Support simple reduction in the inner loop by undoing the simple reduction.
-; RUN: opt < %s -passes="loop(loop-interchange),dce" -undo-simple-reduction -loop-interchange-profitabilities=ignore -S | FileCheck %s
+; RUN: opt < %s -passes="loop-interchange" -loop-interchange-undo-simple-reduction -loop-interchange-profitabilities=ignore -S | FileCheck %s
; for (int i = 0; i < n; i++) {
; s[i] = 0;
@@ -24,6 +25,7 @@ define void @func(ptr noalias noundef readonly captures(none) %a, ptr noalias no
; CHECK-NEXT: br label [[INNERLOOP:%.*]]
; CHECK: innerloop:
; CHECK-NEXT: [[INDEX_J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[INNERLOOP_SPLIT:%.*]] ], [ 0, [[INNERLOOP_PREHEADER:%.*]] ]
+; CHECK-NEXT: [[REDUCTION_Dead:%.*]] = phi double [ [[ADD_LCSSA:%.*]], [[INNERLOOP_SPLIT:%.*]] ], [ 0.000000e+00, [[INNERLOOP_PREHEADER:%.*]] ]
; CHECK-NEXT: br label [[OUTERLOOPHEADER_PREHEADER:%.*]]
; CHECK: innerloop.split1:
; CHECK-NEXT: [[S:%.*]] = load double, ptr [[ADDR_S]], align 8
@@ -36,8 +38,12 @@ define void @func(ptr noalias noundef readonly captures(none) %a, ptr noalias no
; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[B_J_I]], [[A_J_I]]
; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[MUL]], [[NEW_VAR]]
; CHECK-NEXT: store double [[ADD]], ptr [[ADDR_S]], align 8
+; CHECK-NEXT: [[DEAD_J_NEXT:%.*]] = add nuw nsw i64 [[INDEX_J]], 1
+; CHECK-NEXT: [[DEAD_COND:%.*]] = icmp eq i64 [[DEAD_J_NEXT]], [[N]]
; CHECK-NEXT: br label [[OUTERLOOP_LATCH:%.*]]
; CHECK: innerloop.split:
+; CHECK-NEXT: [[DEAD_ADD_LCSSA:%.*]] = phi double [ [[ADD]], [[OUTERLOOP_LATCH]] ]
+; CHECK-NEXT: [[DEAD_LCSSA:%.*]] = phi double [ [[ADD]], [[OUTERLOOP_LATCH]] ]
; CHECK-NEXT: [[J_NEXT:%.*]] = add nuw nsw i64 [[INDEX_J]], 1
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 [[J_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[CMP1]], label [[EXIT_LOOPEXIT:%.*]], label [[INNERLOOP]]
>From 7842fbf3835e71f489c64c6dad6b0d4f1b9cc718 Mon Sep 17 00:00:00 2001
From: buggfg <3171290993 at qq.com>
Date: Wed, 24 Dec 2025 18:05:27 +0800
Subject: [PATCH 06/10] correct the clang-format
---
llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 7cd6744c34528..1b72530a79ae7 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -953,7 +953,7 @@ static Value *followLCSSA(Value *SV) {
}
static bool CheckReductionKind(Loop *L, PHINode *PHI,
- SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
+ SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
RecurrenceDescriptor RD;
if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) {
// Detect floating point reduction only when it can be reordered.
@@ -1043,8 +1043,8 @@ findInnerReductionPhi(Loop *L, Value *V,
}
}
- return nullptr;
- }
+ return nullptr;
+}
static PHINode *getCounterFromInc(Value *IncV, Loop *L) {
Instruction *IncI = dyn_cast<Instruction>(IncV);
@@ -1157,7 +1157,7 @@ bool LoopInterchangeLegality::findSimpleReduction(
return false;
// Find lcssa_phi in OuterLoop's Latch
- BasicBlock *ExitBlock = L->getExitBlock();;
+ BasicBlock *ExitBlock = L->getExitBlock();
if (!ExitBlock)
return false;
>From 6f488cd9da5f28cc6b48d72564c6b9c76609b450 Mon Sep 17 00:00:00 2001
From: buggfg <3171290993 at qq.com>
Date: Wed, 24 Dec 2025 18:22:29 +0800
Subject: [PATCH 07/10] remove unused variable.
---
llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 1b72530a79ae7..bfb3fe2af5f4c 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -1921,7 +1921,6 @@ void LoopInterchangeTransform::undoSimpleReduction() {
Instruction *LoadMem = Builder.CreateLoad(SR.ElemTy, SR.MemRef);
// Check if it's the first iteration.
- auto &InductionPHIs = LIL.getInnerLoopInductions();
PHINode *IV = SR.CounterIV;
Value *IVInit = IV->getIncomingValueForBlock(InnerLoop->getLoopPreheader());
Value *FirstIter = Builder.CreateICmpNE(IV, IVInit, "first.iter");
>From 4f58163dee1e5fd0938d82e9458c6616b6a04212 Mon Sep 17 00:00:00 2001
From: buggfg <3171290993 at qq.com>
Date: Thu, 25 Dec 2025 17:24:51 +0800
Subject: [PATCH 08/10] Add one negative test and improve security
---
.../lib/Transforms/Scalar/LoopInterchange.cpp | 152 +++++++---------
.../simple-reduction-limitation.ll | 172 +++++++++++++-----
.../LoopInterchange/simple-reduction.ll | 29 +--
3 files changed, 208 insertions(+), 145 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index bfb3fe2af5f4c..0789735625934 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -496,12 +496,9 @@ class LoopInterchangeLegality {
// The memory Location
Value *MemRef;
Type *ElemTy;
-
- /// IV used for the loop exit condition.
- PHINode *CounterIV;
};
- const ArrayRef<SimpleReduction> getInnerSimpleReductions() const {
+ ArrayRef<SimpleReduction> getInnerSimpleReductions() const {
return InnerSimpleReductions;
}
@@ -952,7 +949,7 @@ static Value *followLCSSA(Value *SV) {
return followLCSSA(PHI->getIncomingValue(0));
}
-static bool CheckReductionKind(Loop *L, PHINode *PHI,
+static bool checkReductionKind(Loop *L, PHINode *PHI,
SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
RecurrenceDescriptor RD;
if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) {
@@ -1036,7 +1033,7 @@ findInnerReductionPhi(Loop *L, Value *V,
if (PHI->getNumIncomingValues() == 1)
continue;
- if (CheckReductionKind(L, PHI, HasNoWrapInsts))
+ if (checkReductionKind(L, PHI, HasNoWrapInsts))
return PHI;
else
return nullptr;
@@ -1046,65 +1043,6 @@ findInnerReductionPhi(Loop *L, Value *V,
return nullptr;
}
-static PHINode *getCounterFromInc(Value *IncV, Loop *L) {
- Instruction *IncI = dyn_cast<Instruction>(IncV);
- if (!IncI)
- return nullptr;
-
- if (IncI->getOpcode() != Instruction::Add &&
- IncI->getOpcode() != Instruction::Sub)
- return nullptr;
-
- PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0));
- if (Phi && Phi->getParent() == L->getHeader()) {
- return Phi;
- }
-
- // Allow add/sub to be commuted.
- Phi = dyn_cast<PHINode>(IncI->getOperand(1));
- if (Phi && Phi->getParent() == L->getHeader()) {
- return Phi;
- }
-
- return nullptr;
-}
-
-/// UndoSimpleReduction requires the first_iteration check, so look for
-/// the IV used for the loop exit condition
-static PHINode *findCounterIV(Loop *L) {
-
- assert(L->getLoopLatch() && "Must be in simplified form");
-
- BranchInst *BI = cast<BranchInst>(L->getLoopLatch()->getTerminator());
- if (L->isLoopInvariant(BI->getCondition()))
- return nullptr;
-
- ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
- if (!Cond)
- return nullptr;
-
- // Look for a loop invariant RHS
- Value *LHS = Cond->getOperand(0);
- Value *RHS = Cond->getOperand(1);
- if (!L->isLoopInvariant(RHS)) {
- if (!L->isLoopInvariant(LHS))
- return nullptr;
- std::swap(LHS, RHS);
- }
-
- // IndVar = phi[{InitialValue, preheader}, {StepInst, latch}]
- // StepInst = IndVar + step
- // case 1:
- // cmp = IndVar < FinalValue
- PHINode *Counter = dyn_cast<PHINode>(LHS);
- // case 2:
- // cmp = StepInst < FinalValue
- if (!Counter)
- Counter = getCounterFromInc(LHS, L);
-
- return Counter;
-}
-
/// Detect and record the simple reduction of the inner loop.
///
/// innerloop:
@@ -1121,8 +1059,15 @@ bool LoopInterchangeLegality::findSimpleReduction(
// Only support undo simple reduction if the loop nest to be interchanged is
// the innermostin two loops.
- if (!L->isInnermost())
+ if (!L->isInnermost()) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedSimpleReduction",
+ L->getStartLoc(), L->getHeader())
+ << "Cannot undo a reduction when the loop is not the innermost "
+ "loop.";
+ });
return false;
+ }
if (Phi->getNumIncomingValues() != 2)
return false;
@@ -1131,8 +1076,14 @@ bool LoopInterchangeLegality::findSimpleReduction(
Value *Next = Phi->getIncomingValueForBlock(L->getLoopLatch());
// So far only supports constant initial value.
- if (!isa<Constant>(Init))
+ if (!isa<Constant>(Init)) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedSimpleReduction",
+ L->getStartLoc(), L->getHeader())
+ << "Cannot undo a reduction with non-constant initial value.";
+ });
return false;
+ }
// The reduction result must live in the inner loop.
if (Instruction *I = dyn_cast<Instruction>(Next)) {
@@ -1153,7 +1104,7 @@ bool LoopInterchangeLegality::findSimpleReduction(
return false;
// Check the reduction kind.
- if (ReUser != Next && !CheckReductionKind(L, Phi, HasNoWrapInsts))
+ if (ReUser != Next && !checkReductionKind(L, Phi, HasNoWrapInsts))
return false;
// Find lcssa_phi in OuterLoop's Latch
@@ -1175,27 +1126,42 @@ bool LoopInterchangeLegality::findSimpleReduction(
} else
return false;
}
- if (!Lcssa || !Lcssa->hasOneUser())
+ if (!Lcssa)
return false;
+ if (!Lcssa->hasOneUser()) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedSimpleReduction",
+ L->getStartLoc(), L->getHeader())
+ << "Cannot undo a reduction when the reduction is used more than "
+ "once in the outer loop.";
+ });
+ return false;
+ }
+
StoreInst *LcssaStorer =
dyn_cast<StoreInst>(Lcssa->getUniqueUndroppableUser());
- if (!LcssaStorer)
+ if (!LcssaStorer || LcssaStorer->getParent() != ExitBlock)
return false;
Value *MemRef = LcssaStorer->getOperand(1);
Type *ElemTy = LcssaStorer->getOperand(0)->getType();
- // LcssaStorer stores the reduction result in BB. undoSimpleReduction() will
- // move it into the inner loop. Here we must ensure that the memory reference
- // and its operands dominate the target block; otherwise the move is unsafe.
- if (!DT->dominates(dyn_cast<Instruction>(MemRef), ExitBlock))
- return false;
-
- // find the IV used for the loop exit condition.
- PHINode *CounterIV = findCounterIV(L);
- if (!CounterIV)
+ // LcssaStorer stores the reduction result in the inner loop's exit block.
+ // When the reduction is initialized from a constant value, undoing the
+ // reduction inserts a load from the memory object into the inner loop
+ // header, i.e. the memory reference is used earlier than it originally was.
+ // So we must ensure that the memory reference dominates the inner loop
+ // header; otherwise the inserted load would use a value before its def.
+ if (!DT->dominates(dyn_cast<Instruction>(MemRef), L->getHeader())) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedSimpleReduction",
+ L->getStartLoc(), L->getHeader())
+ << "Cannot undo a reduction when memory reference does not "
+ "dominate the inner loop.";
+ });
return false;
+ }
// Found a simple reduction of inner loop.
SimpleReduction SR;
@@ -1206,7 +1172,6 @@ bool LoopInterchangeLegality::findSimpleReduction(
SR.LcssaStorer = LcssaStorer;
SR.MemRef = MemRef;
SR.ElemTy = ElemTy;
- SR.CounterIV = CounterIV;
InnerSimpleReductions.push_back(SR);
return true;
@@ -1254,8 +1219,15 @@ bool LoopInterchangeLegality::findInductionAndReductions(
}
// For now we only support at most one reduction.
- if (InnerSimpleReductions.size() > 1)
+ if (InnerSimpleReductions.size() > 1){
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedSimpleReduction",
+ L->getStartLoc(), L->getHeader())
+ << "Cannot undo a reduction with two or more reductions.";
+ });
return false;
+ }
+
return true;
}
@@ -1915,16 +1887,21 @@ void LoopInterchangeTransform::undoSimpleReduction() {
BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
+ // Check if it's the first iteration.
+ LLVMContext &Context = InnerLoopHeader->getContext();
+ PHINode *FirstIter =
+ Builder.CreatePHI(Type::getInt1Ty(Context), 2, "first.iter");
+ FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 1),
+ InnerLoop->getLoopPreheader());
+ FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 0),
+ InnerLoop->getLoopLatch());
+ assert(FirstIter->isComplete() && "The FirstIter PHI node is not complete.");
+
// When the reduction is intialized from constant value, we need to add
// a stmt loading from the memory object to target basic block in inner
// loop during undoing the reduction.
Instruction *LoadMem = Builder.CreateLoad(SR.ElemTy, SR.MemRef);
- // Check if it's the first iteration.
- PHINode *IV = SR.CounterIV;
- Value *IVInit = IV->getIncomingValueForBlock(InnerLoop->getLoopPreheader());
- Value *FirstIter = Builder.CreateICmpNE(IV, IVInit, "first.iter");
-
// Init new_var to MEM_REF or CONST depending on if it is the first iteration.
Value *NewVar = Builder.CreateSelect(FirstIter, LoadMem, SR.Init, "new.var");
@@ -1940,7 +1917,8 @@ bool LoopInterchangeTransform::transform(
ArrayRef<Instruction *> DropNoWrapInsts) {
bool Transformed = false;
- auto &InnerSimpleReductions = LIL.getInnerSimpleReductions();
+ ArrayRef<LoopInterchangeLegality::SimpleReduction> InnerSimpleReductions =
+ LIL.getInnerSimpleReductions();
if (EnableUndoSimpleReduction && InnerSimpleReductions.size() == 1)
undoSimpleReduction();
diff --git a/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll b/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll
index f4a2266ef9ffe..dee19ed11bd1b 100644
--- a/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll
+++ b/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll
@@ -1,15 +1,23 @@
; Several cases of undoing simple reductions that have not yet been supported.
; RUN: opt < %s -passes="loop-interchange" -loop-interchange-undo-simple-reduction -pass-remarks-missed='loop-interchange' \
-; RUN: -pass-remarks-output=%t -S | FileCheck -check-prefix=IR %s
-; RUN: FileCheck --input-file=%t %s
+; RUN: -pass-remarks-output=%t -S | FileCheck --input-file=%t %s
; 1. The initial value of the reduction is not a constant.
; for (int i = 0; i < n; i++) {
+; r = s[i];
; for (int j = 0; j < n; j++)
-; s[i] = s[i] + a[j][i] * b[j][i];
+; r = r + a[j][i] * b[j][i];
+; s[i] = r;
; }
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: UnsupportedSimpleReduction
+; CHECK-NEXT: Function: simple_reduction_01
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Cannot undo a reduction with non-constant initial value.
+
; CHECK: --- !Missed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: UnsupportedPHIInner
@@ -17,9 +25,7 @@
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Only inner loops with induction or reduction PHI nodes can be interchange currently.
-; IR-LABEL: @simple_reduction_01(
-; IR-NOT: split
-define void @simple_reduction_01(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, i64 noundef %n) {
+define void @simple_reduction_01(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, i64 %n) {
entry:
%cmp = icmp sgt i64 %n, 0
br i1 %cmp, label %outerloop_header, label %exit
@@ -27,17 +33,17 @@ entry:
outerloop_header:
%index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
%addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
- %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
- %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+ %invariant.gep.us = getelementptr inbounds nuw double, ptr %a, i64 %index_i
+ %invariant.gep32.us = getelementptr inbounds nuw double, ptr %b, i64 %index_i
%s_init = load double, ptr %addr_s, align 8
br label %innerloop
innerloop:
%index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
%reduction = phi double [ %s_init, %outerloop_header ], [ %add, %innerloop ]
- %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+ %addr_a_j_i = getelementptr inbounds nuw double, ptr %invariant.gep.us, i64 %index_j
%0 = load double, ptr %addr_a_j_i, align 8
- %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+ %addr_b_j_i = getelementptr inbounds nuw double, ptr %invariant.gep32.us, i64 %index_j
%1 = load double, ptr %addr_b_j_i, align 8
%mul = fmul fast double %1, %0
%add = fadd fast double %mul, %reduction
@@ -58,14 +64,22 @@ exit:
; 2. There are two or more reductions
; for (int i = 0; i < n; i++) {
-; s[i] = 0;
-; s2[i] = 0;
+; r1 = 0;
+; r2 = 0;
; for (int j = 0; j < n; j++){
-; s[i] = s[i] + a[j][i] * b[j][i];
-; s2[i] = s2[i] + a[j][i];
+; r1 = r1 + a[j][i] * b[j][i];
+; r2 = r2 + a[j][i];
; }
+; s[i] = r1;
+; s2[i] = r2;
; }
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: UnsupportedSimpleReduction
+; CHECK-NEXT: Function: simple_reduction_02
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Cannot undo a reduction with two or more reductions.
; CHECK: --- !Missed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: UnsupportedPHIInner
@@ -73,9 +87,7 @@ exit:
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Only inner loops with induction or reduction PHI nodes can be interchange currently.
-; IR-LABEL: @simple_reduction_02(
-; IR-NOT: split
-define void @simple_reduction_02(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, ptr noalias noundef writeonly captures(none) %s2, i64 noundef %n) {
+define void @simple_reduction_02(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, ptr noalias writeonly %s2, i64 %n) {
entry:
%cmp = icmp sgt i64 %n, 0
br i1 %cmp, label %outerloop_header, label %exit
@@ -84,17 +96,17 @@ outerloop_header:
%index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
%addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
%addr_s2 = getelementptr inbounds nuw double, ptr %s2, i64 %index_i
- %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
- %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+ %invariant.gep.us = getelementptr inbounds nuw double, ptr %a, i64 %index_i
+ %invariant.gep32.us = getelementptr inbounds nuw double, ptr %b, i64 %index_i
br label %innerloop
innerloop:
%index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
%reduction = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
- %reduction2 = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
- %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+ %reduction2 = phi double [ 0.000000e+00, %outerloop_header ], [ %add2, %innerloop ]
+ %addr_a_j_i = getelementptr inbounds nuw double, ptr %invariant.gep.us, i64 %index_j
%0 = load double, ptr %addr_a_j_i, align 8
- %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+ %addr_b_j_i = getelementptr inbounds nuw double, ptr %invariant.gep32.us, i64 %index_j
%1 = load double, ptr %addr_b_j_i, align 8
%mul = fmul fast double %1, %0
%add = fadd fast double %mul, %reduction
@@ -116,14 +128,21 @@ exit:
ret void
}
-; 3. The reduction is used more than twice in the outer loop.
+; 3. The reduction is used more than once in the outer loop.
; for (int i = 0; i < n; i++) {
-; s[i] = 0;
+; r = 0;
; for (int j = 0; j < n; j++)
-; s[i] = s[i] + a[j][i] * b[j][i];
-; s[i] += 1;
+; r = r + a[j][i] * b[j][i];
+; r += 1;
+; s[i] = r;
; }
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: UnsupportedSimpleReduction
+; CHECK-NEXT: Function: simple_reduction_03
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Cannot undo a reduction when the reduction is used more than once in the outer loop.
; CHECK: --- !Missed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: UnsupportedPHIInner
@@ -131,9 +150,7 @@ exit:
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Only inner loops with induction or reduction PHI nodes can be interchange currently.
-; IR-LABEL: @simple_reduction_03(
-; IR-NOT: split
-define void @simple_reduction_03(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, i64 noundef %n) {
+define void @simple_reduction_03(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, i64 %n) {
entry:
%cmp = icmp sgt i64 %n, 0
br i1 %cmp, label %outerloop_header, label %exit
@@ -141,16 +158,16 @@ entry:
outerloop_header:
%index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
%addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
- %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
- %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+ %invariant.gep.us = getelementptr inbounds nuw double, ptr %a, i64 %index_i
+ %invariant.gep32.us = getelementptr inbounds nuw double, ptr %b, i64 %index_i
br label %innerloop
innerloop:
%index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
%reduction = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
- %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+ %addr_a_j_i = getelementptr inbounds nuw double, ptr %invariant.gep.us, i64 %index_j
%0 = load double, ptr %addr_a_j_i, align 8
- %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+ %addr_b_j_i = getelementptr inbounds nuw double, ptr %invariant.gep32.us, i64 %index_j
%1 = load double, ptr %addr_b_j_i, align 8
%mul = fmul fast double %1, %0
%add = fadd fast double %mul, %reduction
@@ -174,13 +191,13 @@ exit:
; 4. The reduction is not in the innermost loop.
; for (int i = 0; i < n; i++) {
-; s[i] = 0;
+; r = 0;
; for (int j = 0; j < n; j++) {
-; s[i] = s[i] + a[j][i] * b[j][i]; // reduction
+; r = r + a[j][i] * b[j][i]; // reduction
; for (int k = 0; k < n; k++)
; c[k] = 1;
-
; }
+; s[i] = r;
; }
; CHECK: --- !Missed
@@ -189,10 +206,20 @@ exit:
; CHECK-NEXT: Function: simple_reduction_04
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Only outer loops with induction or reduction PHI nodes can be interchanged currently.
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: UnsupportedSimpleReduction
+; CHECK-NEXT: Function: simple_reduction_04
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Cannot undo a reduction when the loop is not the innermost loop.
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: UnsupportedPHIInner
+; CHECK-NEXT: Function: simple_reduction_04
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Only inner loops with induction or reduction PHI nodes can be interchange currently.
-; IR-LABEL: @simple_reduction_04(
-; IR-NOT: split
-define void @simple_reduction_04(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %c, ptr noalias noundef writeonly captures(none) %s, i64 noundef %n) {
+define void @simple_reduction_04(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %c, ptr noalias writeonly %s, i64 %n) {
entry:
%cmp = icmp sgt i64 %n, 0
br i1 %cmp, label %i_loop_header, label %exit
@@ -200,16 +227,16 @@ entry:
i_loop_header:
%index_i = phi i64 [ 0, %entry ], [ %index_i.next, %i_loop_latch ]
%addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
- %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
- %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+ %invariant.gep.us = getelementptr inbounds nuw double, ptr %a, i64 %index_i
+ %invariant.gep32.us = getelementptr inbounds nuw double, ptr %b, i64 %index_i
br label %j_loop
j_loop:
%index_j = phi i64 [ 0, %i_loop_header ], [ %index_j.next, %j_loop_latch ]
%reduction = phi double [ 0.000000e+00, %i_loop_header ], [ %add, %j_loop_latch ]
- %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+ %addr_a_j_i = getelementptr inbounds nuw double, ptr %invariant.gep.us, i64 %index_j
%0 = load double, ptr %addr_a_j_i, align 8
- %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+ %addr_b_j_i = getelementptr inbounds nuw double, ptr %invariant.gep32.us, i64 %index_j
%1 = load double, ptr %addr_b_j_i, align 8
%mul = fmul fast double %1, %0
%add = fadd fast double %mul, %reduction
@@ -218,7 +245,6 @@ j_loop:
k_loop:
%index_k = phi i64 [ %index_k.next, %k_loop ], [ 0, %j_loop ]
%arrayidx22.us.us = getelementptr inbounds nuw double, ptr %c, i64 %index_k
- ; store double 1.000000e+00, ptr %arrayidx22.us.us, align 8 // Avoid unrelated store instructions from affecting the interchange of the i-loop and j-loop
%index_k.next = add nuw nsw i64 %index_k, 1
%exitcond.not = icmp eq i64 %index_k.next, %n
br i1 %exitcond.not, label %j_loop_latch, label %k_loop
@@ -238,3 +264,61 @@ i_loop_latch:
exit:
ret void
}
+
+
+; 5. The memory reference (the address of s[i]) does not dominate the inner loop's header block.
+; for (int i = 0; i < n; i++) {
+; r = 0;
+; for (int j = 0; j < n; j++)
+; r = r + a[j][i] * b[j][i];
+; s[i] = r; // the address of s[i] is computed here, after the inner loop
+; }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: UnsupportedSimpleReduction
+; CHECK-NEXT: Function: simple_reduction_05
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Cannot undo a reduction when memory reference does not dominate the inner loop.
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: UnsupportedPHIInner
+; CHECK-NEXT: Function: simple_reduction_05
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Only inner loops with induction or reduction PHI nodes can be interchange currently.
+
+define void @simple_reduction_05(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, i64 %n) {
+entry:
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %outerloop_header, label %exit
+
+outerloop_header:
+ %index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
+ %invariant.gep.us = getelementptr inbounds nuw double, ptr %a, i64 %index_i
+ %invariant.gep32.us = getelementptr inbounds nuw double, ptr %b, i64 %index_i
+ br label %innerloop
+
+innerloop:
+ %index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
+ %reduction = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
+ %addr_a_j_i = getelementptr inbounds nuw double, ptr %invariant.gep.us, i64 %index_j
+ %0 = load double, ptr %addr_a_j_i, align 8
+ %addr_b_j_i = getelementptr inbounds nuw double, ptr %invariant.gep32.us, i64 %index_j
+ %1 = load double, ptr %addr_b_j_i, align 8
+ %mul = fmul fast double %1, %0
+ %add = fadd fast double %mul, %reduction
+ %index_j.next = add nuw nsw i64 %index_j, 1
+ %cond1 = icmp eq i64 %index_j.next, %n
+ br i1 %cond1, label %outerloop_latch, label %innerloop
+
+outerloop_latch:
+ %lcssa = phi double [ %add, %innerloop ]
+ %addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
+ store double %lcssa, ptr %addr_s, align 8
+ %index_i.next = add nuw nsw i64 %index_i, 1
+ %cond2 = icmp eq i64 %index_i.next, %n
+ br i1 %cond2, label %exit, label %outerloop_header
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopInterchange/simple-reduction.ll b/llvm/test/Transforms/LoopInterchange/simple-reduction.ll
index d16f07b6b084e..db37c64013232 100644
--- a/llvm/test/Transforms/LoopInterchange/simple-reduction.ll
+++ b/llvm/test/Transforms/LoopInterchange/simple-reduction.ll
@@ -3,13 +3,14 @@
; RUN: opt < %s -passes="loop-interchange" -loop-interchange-undo-simple-reduction -loop-interchange-profitabilities=ignore -S | FileCheck %s
; for (int i = 0; i < n; i++) {
-; s[i] = 0;
+; r = 0;
; for (int j = 0; j < n; j++)
-; s[i] = s[i] + a[j][i] * b[j][i];
+; r = r + a[j][i] * b[j][i];
+; s[i] = r;
; }
-define void @func(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, i64 noundef %n) {
-; CHECK-LABEL: define void @func(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, ptr noalias noundef writeonly captures(none) %s, i64 noundef %n) {
+define void @func(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, i64 %n) {
+; CHECK-LABEL: define void @func(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, i64 %n) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP]], label [[INNERLOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
@@ -18,22 +19,22 @@ define void @func(ptr noalias noundef readonly captures(none) %a, ptr noalias no
; CHECK: outerloop_header:
; CHECK-NEXT: [[INDEX_I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[OUTERLOOP_LATCH:%.*]] ], [ 0, [[OUTERLOOPHEADER_PREHEADER:%.*]] ]
; CHECK-NEXT: [[ADDR_S:%.*]] = getelementptr inbounds nuw double, ptr %s, i64 [[INDEX_I]]
-; CHECK-NEXT: [[ADDR_A:%.*]] = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 [[INDEX_I]]
-; CHECK-NEXT: [[ADDR_B:%.*]] = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 [[INDEX_I]]
+; CHECK-NEXT: [[ADDR_A:%.*]] = getelementptr inbounds nuw double, ptr %a, i64 [[INDEX_I]]
+; CHECK-NEXT: [[ADDR_B:%.*]] = getelementptr inbounds nuw double, ptr %b, i64 [[INDEX_I]]
; CHECK-NEXT: br label [[INNERLOOP_SPLIT1:%.*]]
; CHECK: innerloop.preheader:
; CHECK-NEXT: br label [[INNERLOOP:%.*]]
; CHECK: innerloop:
; CHECK-NEXT: [[INDEX_J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[INNERLOOP_SPLIT:%.*]] ], [ 0, [[INNERLOOP_PREHEADER:%.*]] ]
-; CHECK-NEXT: [[REDUCTION_Dead:%.*]] = phi double [ [[ADD_LCSSA:%.*]], [[INNERLOOP_SPLIT:%.*]] ], [ 0.000000e+00, [[INNERLOOP_PREHEADER:%.*]] ]
+; CHECK-NEXT: [[DEAD_REDUCTION:%.*]] = phi double [ [[ADD_LCSSA:%.*]], [[INNERLOOP_SPLIT:%.*]] ], [ 0.000000e+00, [[INNERLOOP_PREHEADER:%.*]] ]
+; CHECK-NEXT: [[FIRSTITER:%.*]] = phi i1 [ false, [[INNERLOOP_SPLIT:%.*]] ], [ true, [[INNERLOOP_PREHEADER:%.*]] ]
; CHECK-NEXT: br label [[OUTERLOOPHEADER_PREHEADER:%.*]]
; CHECK: innerloop.split1:
; CHECK-NEXT: [[S:%.*]] = load double, ptr [[ADDR_S]], align 8
-; CHECK-NEXT: [[FIRSTITER:%.*]] = icmp ne i64 [[INDEX_J]], 0
; CHECK-NEXT: [[NEW_VAR:%.*]] = select i1 [[FIRSTITER]], double [[S]], double 0.000000e+00
-; CHECK-NEXT: [[ADDR_A_J_I:%.*]] = getelementptr inbounds nuw [100 x double], ptr [[ADDR_A]], i64 [[INDEX_J]]
+; CHECK-NEXT: [[ADDR_A_J_I:%.*]] = getelementptr inbounds nuw double, ptr [[ADDR_A]], i64 [[INDEX_J]]
; CHECK-NEXT: [[A_J_I:%.*]] = load double, ptr [[ADDR_A_J_I]], align 8
-; CHECK-NEXT: [[ADDR_B_J_I:%.*]] = getelementptr inbounds nuw [100 x double], ptr [[ADDR_B]], i64 [[INDEX_J]]
+; CHECK-NEXT: [[ADDR_B_J_I:%.*]] = getelementptr inbounds nuw double, ptr [[ADDR_B]], i64 [[INDEX_J]]
; CHECK-NEXT: [[B_J_I:%.*]] = load double, ptr [[ADDR_B_J_I]], align 8
; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[B_J_I]], [[A_J_I]]
; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[MUL]], [[NEW_VAR]]
@@ -63,16 +64,16 @@ entry:
outerloop_header:
%index_i = phi i64 [ 0, %entry ], [ %index_i.next, %outerloop_latch ]
%addr_s = getelementptr inbounds nuw double, ptr %s, i64 %index_i
- %invariant.gep.us = getelementptr inbounds nuw [100 x double], ptr %a, i64 0, i64 %index_i
- %invariant.gep32.us = getelementptr inbounds nuw [100 x double], ptr %b, i64 0, i64 %index_i
+ %invariant.gep.us = getelementptr inbounds nuw double, ptr %a, i64 %index_i
+ %invariant.gep32.us = getelementptr inbounds nuw double, ptr %b, i64 %index_i
br label %innerloop
innerloop:
%index_j = phi i64 [ 0, %outerloop_header ], [ %index_j.next, %innerloop ]
%reduction = phi double [ 0.000000e+00, %outerloop_header ], [ %add, %innerloop ]
- %addr_a_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep.us, i64 %index_j
+ %addr_a_j_i = getelementptr inbounds nuw double, ptr %invariant.gep.us, i64 %index_j
%0 = load double, ptr %addr_a_j_i, align 8
- %addr_b_j_i = getelementptr inbounds nuw [100 x double], ptr %invariant.gep32.us, i64 %index_j
+ %addr_b_j_i = getelementptr inbounds nuw double, ptr %invariant.gep32.us, i64 %index_j
%1 = load double, ptr %addr_b_j_i, align 8
%mul = fmul fast double %1, %0
%add = fadd fast double %mul, %reduction
>From aa84d82fcff25f804c643f8c95958d4d6fbad0ae Mon Sep 17 00:00:00 2001
From: buggfg <3171290993 at qq.com>
Date: Thu, 25 Dec 2025 17:28:00 +0800
Subject: [PATCH 09/10] correct the format
---
llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 0789735625934..12599ae4970d5 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -1219,7 +1219,7 @@ bool LoopInterchangeLegality::findInductionAndReductions(
}
// For now we only support at most one reduction.
- if (InnerSimpleReductions.size() > 1){
+ if (InnerSimpleReductions.size() > 1) {
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedSimpleReduction",
L->getStartLoc(), L->getHeader())
>From b159dc43d98e32ebce50e788c82d1c3410fddafb Mon Sep 17 00:00:00 2001
From: buggfg <3171290993 at qq.com>
Date: Thu, 25 Dec 2025 18:06:34 +0800
Subject: [PATCH 10/10] fix test
---
.../LoopInterchange/simple-reduction-limitation.ll | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll b/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll
index dee19ed11bd1b..7218415fa81c0 100644
--- a/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll
+++ b/llvm/test/Transforms/LoopInterchange/simple-reduction-limitation.ll
@@ -1,6 +1,7 @@
; Several cases of undoing simple reductions that have not yet been supported.
; RUN: opt < %s -passes="loop-interchange" -loop-interchange-undo-simple-reduction -pass-remarks-missed='loop-interchange' \
-; RUN: -pass-remarks-output=%t -S | FileCheck --input-file=%t %s
+; RUN: -pass-remarks-output=%t -S | FileCheck -check-prefix=IR %s
+; RUN: FileCheck --input-file=%t %s
; 1. The initial value of the reduction is not a constant.
@@ -25,6 +26,8 @@
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Only inner loops with induction or reduction PHI nodes can be interchange currently.
+; IR-LABEL: @simple_reduction_01(
+; IR-NOT: split
define void @simple_reduction_01(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, i64 %n) {
entry:
%cmp = icmp sgt i64 %n, 0
@@ -87,6 +90,8 @@ exit:
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Only inner loops with induction or reduction PHI nodes can be interchange currently.
+; IR-LABEL: @simple_reduction_02(
+; IR-NOT: split
define void @simple_reduction_02(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, ptr noalias writeonly %s2, i64 %n) {
entry:
%cmp = icmp sgt i64 %n, 0
@@ -150,6 +155,8 @@ exit:
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Only inner loops with induction or reduction PHI nodes can be interchange currently.
+; IR-LABEL: @simple_reduction_03(
+; IR-NOT: split
define void @simple_reduction_03(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, i64 %n) {
entry:
%cmp = icmp sgt i64 %n, 0
@@ -219,6 +226,8 @@ exit:
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Only inner loops with induction or reduction PHI nodes can be interchange currently.
+; IR-LABEL: @simple_reduction_04(
+; IR-NOT: split
define void @simple_reduction_04(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %c, ptr noalias writeonly %s, i64 %n) {
entry:
%cmp = icmp sgt i64 %n, 0
@@ -287,6 +296,8 @@ exit:
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Only inner loops with induction or reduction PHI nodes can be interchange currently.
+; IR-LABEL: @simple_reduction_05(
+; IR-NOT: split
define void @simple_reduction_05(ptr noalias readonly %a, ptr noalias readonly %b, ptr noalias writeonly %s, i64 %n) {
entry:
%cmp = icmp sgt i64 %n, 0
More information about the llvm-commits
mailing list