[llvm] [LoopInterchange] Add an option to prioritize vectorization (PR #131988)
Ryotaro Kasuga via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 19 08:18:52 PDT 2025
https://github.com/kasuga-fj updated https://github.com/llvm/llvm-project/pull/131988
>From 6598b4f49b52b5c54c1990329a8586f71a366d14 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Fri, 14 Mar 2025 15:54:18 +0000
Subject: [PATCH 1/2] [LoopInterchange] Add an option to prioritize
vectorization
The LoopInterchange cost-model consists of several decision rules. They
are called one by one, and if some rule can determine the profitability,
then the subsequent rules aren't called. In the current implementation,
the rule for `CacheCostAnalysis` is called first, and if it fails to
determine the profitability, then the rule for vectorization is called.
However, there are cases where interchanging loops for vectorization
makes the code faster even if such exchanges are detrimental to the
cache. For example, exchanging the inner two loops in the following
example runs about 3x faster on my local machine (compiled with `-O3
-mcpu=neoverse-v2 -mllvm -cache-line-size=64`), even though it's
rejected by the rule based on cache cost. (NOTE: LoopInterchange cannot
exchange these loops due to legality checks. This should also be
improved.)
```c
__attribute__((aligned(64))) float aa[256][256],bb[256][256],cc[256][256],
dd[256][256],ee[256][256],ff[256][256];
// Alternative of TSVC s231 with more array accesses than the original.
void s231_alternative() {
for (int nl = 0; nl < 100*(100000/256); nl++) {
for (int i = 0; i < 256; ++i) {
for (int j = 1; j < 256; j++) {
aa[j][i] = aa[j-1][i] + bb[j][i] + cc[i][j]
+ dd[i][j] + ee[i][j] + ff[i][j];
}
}
}
}
```
This patch introduces a new option to prioritize the vectorization rule
over the cache cost rule.
Related issue: #131130
---
.../lib/Transforms/Scalar/LoopInterchange.cpp | 62 +++++++++++---
.../profitability-vectorization.ll | 81 +++++++++++++++++++
2 files changed, 130 insertions(+), 13 deletions(-)
create mode 100644 llvm/test/Transforms/LoopInterchange/profitability-vectorization.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 61b86d2aecef0..364acb0ebc344 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -84,6 +84,11 @@ static cl::opt<unsigned int> MaxLoopNestDepth(
"loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden,
cl::desc("Maximum depth of loop nest considered for the transform"));
+static cl::opt<bool> PrioritizeVectorization(
+ "loop-interchange-prioritize-vectorization", cl::init(false), cl::Hidden,
+ cl::desc("Prioritize increasing vectorization opportunity over cache cost "
+ "when determining profitability"));
+
#ifndef NDEBUG
static void printDepMatrix(CharMatrix &DepMatrix) {
for (auto &Row : DepMatrix) {
@@ -1193,22 +1198,53 @@ bool LoopInterchangeProfitability::isProfitable(
unsigned OuterLoopId, CharMatrix &DepMatrix,
const DenseMap<const Loop *, unsigned> &CostMap,
std::unique_ptr<CacheCost> &CC) {
- // isProfitable() is structured to avoid endless loop interchange.
- // If loop cache analysis could decide the profitability then,
- // profitability check will stop and return the analysis result.
- // If cache analysis failed to analyze the loopnest (e.g.,
- // due to delinearization issues) then only check whether it is
- // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
- // analysis the profitability then only, isProfitableForVectorization
- // will decide.
- std::optional<bool> shouldInterchange =
- isProfitablePerLoopCacheAnalysis(CostMap, CC);
- if (!shouldInterchange.has_value()) {
- shouldInterchange = isProfitablePerInstrOrderCost();
- if (!shouldInterchange.has_value())
+ // isProfitable() is structured to avoid endless loop interchange. If the
+ // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
+ // decide the profitability then, profitability check will stop and return the
+ // analysis result. If it failed to determine it (e.g., cache analysis failed
+ // to analyze the loopnest due to delinearization issues) then go ahead to the
+ // second highest priority rule (isProfitablePerInstrOrderCost by default).
+ // Likewise, if it fails to analyze the profitability, then only the last
+ // rule (isProfitableForVectorization by default) will decide.
+ enum class RuleTy {
+ PerLoopCacheAnalysis,
+ PerInstrOrderCost,
+ ForVectorization,
+ };
+
+ // We prefer cache cost to vectorization by default.
+ RuleTy RuleOrder[3] = {RuleTy::PerLoopCacheAnalysis,
+ RuleTy::PerInstrOrderCost, RuleTy::ForVectorization};
+
+ // If we prefer vectorization to cache cost, change the order of application
+ // of each rule.
+ if (PrioritizeVectorization) {
+ RuleOrder[0] = RuleTy::ForVectorization;
+ RuleOrder[1] = RuleTy::PerLoopCacheAnalysis;
+ RuleOrder[2] = RuleTy::PerInstrOrderCost;
+ }
+
+ std::optional<bool> shouldInterchange;
+ for (RuleTy RT : RuleOrder) {
+ switch (RT) {
+ case RuleTy::PerLoopCacheAnalysis:
+ shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
+ break;
+ case RuleTy::PerInstrOrderCost:
+ shouldInterchange = isProfitablePerInstrOrderCost();
+ break;
+ case RuleTy::ForVectorization:
shouldInterchange =
isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
+ break;
+ }
+
+ // If this rule could determine the profitability, don't call subsequent
+ // rules.
+ if (shouldInterchange.has_value())
+ break;
}
+
if (!shouldInterchange.has_value()) {
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
diff --git a/llvm/test/Transforms/LoopInterchange/profitability-vectorization.ll b/llvm/test/Transforms/LoopInterchange/profitability-vectorization.ll
new file mode 100644
index 0000000000000..0018aa0308f28
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/profitability-vectorization.ll
@@ -0,0 +1,81 @@
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 \
+; RUN: -pass-remarks-output=%t -disable-output
+; RUN: FileCheck -input-file %t --check-prefix=PROFIT-CACHE %s
+
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 \
+; RUN: -pass-remarks-output=%t -disable-output -loop-interchange-prioritize-vectorization=1
+; RUN: FileCheck -input-file %t --check-prefix=PROFIT-VEC %s
+
+@A = dso_local global [256 x [256 x float]] zeroinitializer
+@B = dso_local global [256 x [256 x float]] zeroinitializer
+@C = dso_local global [256 x [256 x float]] zeroinitializer
+@D = dso_local global [256 x [256 x float]] zeroinitializer
+@E = dso_local global [256 x [256 x float]] zeroinitializer
+@F = dso_local global [256 x [256 x float]] zeroinitializer
+
+; Check the behavior of the LoopInterchange cost-model. In the below code,
+; exchanging the loops is not profitable in terms of cache, but it is necessary
+; to vectorize the innermost loop.
+;
+; for (int i = 0; i < 256; i++)
+; for (int j = 1; j < 256; j++)
+; A[j][i] = A[j-1][i] + B[j][i] + C[i][j] + D[i][j] + E[i][j] + F[i][j];
+;
+
+; PROFIT-CACHE: --- !Missed
+; PROFIT-CACHE-NEXT: Pass: loop-interchange
+; PROFIT-CACHE-NEXT: Name: InterchangeNotProfitable
+; PROFIT-CACHE-NEXT: Function: f
+; PROFIT-CACHE-NEXT: Args:
+; PROFIT-CACHE-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
+; PROFIT-CACHE-NEXT: ...
+
+; PROFIT-VEC: --- !Passed
+; PROFIT-VEC-NEXT: Pass: loop-interchange
+; PROFIT-VEC-NEXT: Name: Interchanged
+; PROFIT-VEC-NEXT: Function: f
+; PROFIT-VEC-NEXT: Args:
+; PROFIT-VEC-NEXT: - String: Loop interchanged with enclosing loop.
+; PROFIT-VEC-NEXT: ...
+define void @f() {
+entry:
+ br label %for.i.header
+
+for.i.header:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.i.inc ]
+ br label %for.j.body
+
+for.j.body:
+ %j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
+ %j.dec = add nsw i64 %j, -1
+ %a.0.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %j.dec, i64 %i
+ %b.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @B, i64 %j, i64 %i
+ %c.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i, i64 %j
+ %d.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @D, i64 %i, i64 %j
+ %e.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @E, i64 %i, i64 %j
+ %f.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @F, i64 %i, i64 %j
+ %a.0 = load float, ptr %a.0.index, align 4
+ %b = load float, ptr %b.index, align 4
+ %c = load float, ptr %c.index, align 4
+ %d = load float, ptr %d.index, align 4
+ %e = load float, ptr %e.index, align 4
+ %f = load float, ptr %f.index, align 4
+ %add.0 = fadd float %a.0, %b
+ %add.1 = fadd float %add.0, %c
+ %add.2 = fadd float %add.1, %d
+ %add.3 = fadd float %add.2, %e
+ %add.4 = fadd float %add.3, %f
+ %a.1.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %j, i64 %i
+ store float %add.4, ptr %a.1.index, align 4
+ %j.next = add nuw nsw i64 %j, 1
+ %cmp.j = icmp eq i64 %j.next, 256
+ br i1 %cmp.j, label %for.i.inc, label %for.j.body
+
+for.i.inc:
+ %i.next = add nuw nsw i64 %i, 1
+ %cmp.i = icmp eq i64 %i.next, 256
+ br i1 %cmp.i, label %exit, label %for.i.header
+
+exit:
+ ret void
+}
>From c0b8ab0b91877d640d5c444fbbf00a54948a9e8d Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Thu, 20 Mar 2025 00:18:44 +0900
Subject: [PATCH 2/2] Update llvm/lib/Transforms/Scalar/LoopInterchange.cpp
Co-authored-by: Florian Hahn <flo at fhahn.com>
---
llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 364acb0ebc344..d81467afe4a59 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -1224,7 +1224,7 @@ bool LoopInterchangeProfitability::isProfitable(
RuleOrder[2] = RuleTy::PerInstrOrderCost;
}
- std::optional<bool> shouldInterchange;
+ std::optional<bool> ShouldInterchange;
for (RuleTy RT : RuleOrder) {
switch (RT) {
case RuleTy::PerLoopCacheAnalysis:
More information about the llvm-commits
mailing list