[llvm] [LoopInterchange] Defer CacheCost calculation until needed (PR #146874)
Ryotaro Kasuga via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 3 06:37:35 PDT 2025
https://github.com/kasuga-fj updated https://github.com/llvm/llvm-project/pull/146874
From 72e3e12480b763241f6659767edb2a2fb06c77c7 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Thu, 3 Jul 2025 11:18:51 +0000
Subject: [PATCH 1/2] [LoopInterchange] Defer CacheCost calculation until
needed
---
.../lib/Transforms/Scalar/LoopInterchange.cpp | 96 +++++++++++++++----
.../delay-cachecost-calculation.ll | 77 +++++++++++++++
2 files changed, 153 insertions(+), 20 deletions(-)
create mode 100644 llvm/test/Transforms/LoopInterchange/delay-cachecost-calculation.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 5bb5f749d9f1a..9b3bb2053961e 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -407,6 +407,33 @@ class LoopInterchangeLegality {
SmallVector<PHINode *, 8> InnerLoopInductions;
};
+/// Manages information utilized by the profitability check for cache. The main
+/// purpose of this class is to delay the computation of CacheCost until it is
+/// actually needed.
+class LoopInterchangeCacheCostManager {
+ Loop *OutermostLoop;
+ LoopStandardAnalysisResults *AR;
+ DependenceInfo *DI;
+
+ /// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
+ /// that the result can be nullptr.
+ std::optional<std::unique_ptr<CacheCost>> CC;
+
+ /// Maps each loop to an index representing the optimal position within the
+ /// loop-nest, as determined by the cache cost analysis.
+ DenseMap<const Loop *, unsigned> CostMap;
+
+ void computeIfUnitinialized();
+
+public:
+ LoopInterchangeCacheCostManager(Loop *OutermostLoop,
+ LoopStandardAnalysisResults *AR,
+ DependenceInfo *DI)
+ : OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
+ std::unique_ptr<CacheCost> &getCacheCost();
+ const DenseMap<const Loop *, unsigned> &getCostMap();
+};
+
/// LoopInterchangeProfitability checks if it is profitable to interchange the
/// loop.
class LoopInterchangeProfitability {
@@ -419,8 +446,7 @@ class LoopInterchangeProfitability {
bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
unsigned InnerLoopId, unsigned OuterLoopId,
CharMatrix &DepMatrix,
- const DenseMap<const Loop *, unsigned> &CostMap,
- std::unique_ptr<CacheCost> &CC);
+ LoopInterchangeCacheCostManager &LICCM);
private:
int getInstrOrderCost();
@@ -477,15 +503,15 @@ struct LoopInterchange {
LoopInfo *LI = nullptr;
DependenceInfo *DI = nullptr;
DominatorTree *DT = nullptr;
- std::unique_ptr<CacheCost> CC = nullptr;
+ LoopStandardAnalysisResults *AR = nullptr;
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;
LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
- DominatorTree *DT, std::unique_ptr<CacheCost> &CC,
+ DominatorTree *DT, LoopStandardAnalysisResults *AR,
OptimizationRemarkEmitter *ORE)
- : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {}
+ : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}
bool run(Loop *L) {
if (L->getParentLoop())
@@ -548,11 +574,12 @@ struct LoopInterchange {
// indicates the loop should be placed as the innermost loop.
//
// For the old pass manager CacheCost would be null.
- DenseMap<const Loop *, unsigned> CostMap;
- if (CC != nullptr) {
- for (const auto &[Idx, Cost] : enumerate(CC->getLoopCosts()))
- CostMap[Cost.first] = Idx;
- }
+ // DenseMap<const Loop *, unsigned> CostMap;
+ // if (CC != nullptr) {
+ // for (const auto &[Idx, Cost] : enumerate(CC->getLoopCosts()))
+ // CostMap[Cost.first] = Idx;
+ // }
+ LoopInterchangeCacheCostManager LICCM(LoopList[0], AR, DI);
// We try to achieve the globally optimal memory access for the loopnest,
// and do interchange based on a bubble-sort fasion. We start from
// the innermost loop, move it outwards to the best possible position
@@ -561,7 +588,7 @@ struct LoopInterchange {
bool ChangedPerIter = false;
for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
bool Interchanged =
- processLoop(LoopList, i, i - 1, DependencyMatrix, CostMap);
+ processLoop(LoopList, i, i - 1, DependencyMatrix, LICCM);
ChangedPerIter |= Interchanged;
Changed |= Interchanged;
}
@@ -576,7 +603,7 @@ struct LoopInterchange {
bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
unsigned OuterLoopId,
std::vector<std::vector<char>> &DependencyMatrix,
- const DenseMap<const Loop *, unsigned> &CostMap) {
+ LoopInterchangeCacheCostManager &LICCM) {
Loop *OuterLoop = LoopList[OuterLoopId];
Loop *InnerLoop = LoopList[InnerLoopId];
LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
@@ -589,7 +616,7 @@ struct LoopInterchange {
LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
- DependencyMatrix, CostMap, CC)) {
+ DependencyMatrix, LICCM)) {
LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
return false;
}
@@ -1122,6 +1149,36 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
return true;
}
+void LoopInterchangeCacheCostManager::computeIfUnitinialized() {
+ if (CC.has_value())
+ return;
+
+ LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
+ CC = CacheCost::getCacheCost(*OutermostLoop, *AR, *DI);
+ // Obtain the loop vector returned from loop cache analysis beforehand,
+ // and put each <Loop, index> pair into a map for constant time query
+ // later. Indices in loop vector represent the optimal order of the
+ // corresponding loop, e.g., given a loopnest with depth N, index 0
+ // indicates the loop should be placed as the outermost loop and index N
+ // indicates the loop should be placed as the innermost loop.
+ //
+ // For the old pass manager CacheCost would be null.
+ if (*CC != nullptr)
+ for (const auto &[Idx, Cost] : enumerate((*CC)->getLoopCosts()))
+ CostMap[Cost.first] = Idx;
+}
+
+std::unique_ptr<CacheCost> &LoopInterchangeCacheCostManager::getCacheCost() {
+ computeIfUnitinialized();
+ return *CC;
+}
+
+const DenseMap<const Loop *, unsigned> &
+LoopInterchangeCacheCostManager::getCostMap() {
+ computeIfUnitinialized();
+ return CostMap;
+}
+
int LoopInterchangeProfitability::getInstrOrderCost() {
unsigned GoodOrder, BadOrder;
BadOrder = GoodOrder = 0;
@@ -1247,8 +1304,7 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
bool LoopInterchangeProfitability::isProfitable(
const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
unsigned OuterLoopId, CharMatrix &DepMatrix,
- const DenseMap<const Loop *, unsigned> &CostMap,
- std::unique_ptr<CacheCost> &CC) {
+ LoopInterchangeCacheCostManager &LICCM) {
// isProfitable() is structured to avoid endless loop interchange. If the
// highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
// decide the profitability then, profitability check will stop and return the
@@ -1261,9 +1317,12 @@ bool LoopInterchangeProfitability::isProfitable(
std::optional<bool> shouldInterchange;
for (RuleTy RT : Profitabilities) {
switch (RT) {
- case RuleTy::PerLoopCacheAnalysis:
+ case RuleTy::PerLoopCacheAnalysis: {
+ std::unique_ptr<CacheCost> &CC = LICCM.getCacheCost();
+ const DenseMap<const Loop *, unsigned> &CostMap = LICCM.getCostMap();
shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
break;
+ }
case RuleTy::PerInstrOrderCost:
shouldInterchange = isProfitablePerInstrOrderCost();
break;
@@ -1841,10 +1900,7 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
});
DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
- std::unique_ptr<CacheCost> CC =
- CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);
-
- if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
+ if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
return PreservedAnalyses::all();
U.markLoopNestChanged(true);
return getLoopPassPreservedAnalyses();
diff --git a/llvm/test/Transforms/LoopInterchange/delay-cachecost-calculation.ll b/llvm/test/Transforms/LoopInterchange/delay-cachecost-calculation.ll
new file mode 100644
index 0000000000000..69ee30eadade7
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/delay-cachecost-calculation.ll
@@ -0,0 +1,77 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-interchange -debug -disable-output %s 2>&1 | FileCheck %s
+
+@A = global [16 x [16 x i32]] zeroinitializer
+
+; Check that the CacheCost is calculated only when required. In this case, it
+; is computed after passing the legality check.
+;
+; for (i = 0; i < 16; i++)
+; for (j = 0; j < 16; j++)
+; A[j][i] += 1;
+
+; CHECK: Loops are legal to interchange
+; CHECK: Compute CacheCost
+define void @legal_to_interchange() {
+entry:
+ br label %for.i.header
+
+for.i.header:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %for.i.latch ]
+ br label %for.j
+
+for.j:
+ %j = phi i32 [ 0, %for.i.header ], [ %j.next, %for.j ]
+ %idx = getelementptr inbounds [16 x [16 x i32]], ptr @A, i32 %j, i32 %i
+ %val = load i32, ptr %idx
+ %inc = add i32 %val, 1
+ store i32 %inc, ptr %idx
+ %j.next = add i32 %j, 1
+ %j.exit = icmp eq i32 %j.next, 16
+ br i1 %j.exit, label %for.i.latch, label %for.j
+
+for.i.latch:
+ %i.next = add i32 %i, 1
+ %i.exit = icmp eq i32 %i.next, 16
+ br i1 %i.exit, label %exit, label %for.i.header
+
+exit:
+ ret void
+}
+
+; Check that the CacheCost is not calculated when not required. In this case,
+; the legality check always fails so that we do not need to compute the
+; CacheCost.
+;
+; for (i = 0; i < 16; i++)
+; for (j = 0; j < 16; j++)
+; A[j][i] = A[i][j];
+
+; CHECK-NOT: Compute CacheCost
+define void @illegal_to_interchange() {
+entry:
+ br label %for.i.header
+
+for.i.header:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %for.i.latch ]
+ br label %for.j
+
+for.j:
+ %j = phi i32 [ 0, %for.i.header ], [ %j.next, %for.j ]
+ %idx.load = getelementptr inbounds [16 x [16 x i32]], ptr @A, i32 %i, i32 %j
+ %idx.store = getelementptr inbounds [16 x [16 x i32]], ptr @A, i32 %j, i32 %i
+ %val = load i32, ptr %idx.load
+ store i32 %val, ptr %idx.store
+ %j.next = add i32 %j, 1
+ %j.exit = icmp eq i32 %j.next, 16
+ br i1 %j.exit, label %for.i.latch, label %for.j
+
+for.i.latch:
+ %i.next = add i32 %i, 1
+ %i.exit = icmp eq i32 %i.next, 16
+ br i1 %i.exit, label %exit, label %for.i.header
+
+exit:
+ ret void
+}
From 23771cad96d74123362883c03129caaa4ba0bf03 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Thu, 3 Jul 2025 22:37:21 +0900
Subject: [PATCH 2/2] Clean up commented-out code
---
llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 13 -------------
1 file changed, 13 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 9b3bb2053961e..b8c871814e9ab 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -566,19 +566,6 @@ struct LoopInterchange {
}
unsigned SelecLoopId = selectLoopForInterchange(LoopList);
- // Obtain the loop vector returned from loop cache analysis beforehand,
- // and put each <Loop, index> pair into a map for constant time query
- // later. Indices in loop vector reprsent the optimal order of the
- // corresponding loop, e.g., given a loopnest with depth N, index 0
- // indicates the loop should be placed as the outermost loop and index N
- // indicates the loop should be placed as the innermost loop.
- //
- // For the old pass manager CacheCost would be null.
- // DenseMap<const Loop *, unsigned> CostMap;
- // if (CC != nullptr) {
- // for (const auto &[Idx, Cost] : enumerate(CC->getLoopCosts()))
- // CostMap[Cost.first] = Idx;
- // }
LoopInterchangeCacheCostManager LICCM(LoopList[0], AR, DI);
// We try to achieve the globally optimal memory access for the loopnest,
// and do interchange based on a bubble-sort fasion. We start from
More information about the llvm-commits
mailing list