[llvm] [LoopFusion] Extending SIV to handle separate loops (PR #146383)
Alireza Torabian via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 22 09:03:20 PDT 2025
https://github.com/1997alireza updated https://github.com/llvm/llvm-project/pull/146383
>From f463d8ee5a88ddc83f4056463aeae6640e091ca5 Mon Sep 17 00:00:00 2001
From: a00917109 <alireza.torabian at huawei.com>
Date: Thu, 18 Sep 2025 18:46:06 -0400
Subject: [PATCH] [LoopFusion] Detecting loop-carried dependencies using DA
info
The loop fusion pass will use the information provided by DA to
detect loop-carried dependencies and fuse the loops if it is legal.
---
llvm/lib/Transforms/Scalar/LoopFuse.cpp | 29 +++
.../LoopFusion/backward_loop_carried.ll | 187 ++++++++++++++++++
llvm/test/Transforms/LoopFusion/simple.ll | 41 ++--
3 files changed, 241 insertions(+), 16 deletions(-)
create mode 100644 llvm/test/Transforms/LoopFusion/backward_loop_carried.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index b5eb647a042b9..788ba0beed88d 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -100,6 +100,8 @@ STATISTIC(OnlySecondCandidateIsGuarded,
"The second candidate is guarded while the first one is not");
STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions.");
STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions.");
+STATISTIC(NumDepSafeFused, "Number of fused loops with dependencies "
+ "proven safe based on the dependence direction");
enum FusionDependenceAnalysisChoice {
FUSION_DEPENDENCE_ANALYSIS_SCEV,
@@ -1371,6 +1373,33 @@ struct LoopFuser {
<< "\n");
}
#endif
+ unsigned Levels = DepResult->getLevels();
+ unsigned SameSDLevels = DepResult->getSameSDLevels();
+ unsigned CurLoopLevel = FC0.L->getLoopDepth();
+
+ bool OuterEqDir = true;
+ for (unsigned II = 1; II <= std::min(CurLoopLevel - 1, Levels); ++II) {
+ unsigned Direction = DepResult->getDirection(II, II > Levels);
+ if (!(Direction & Dependence::DVEntry::EQ)) {
+ // Different accesses in the outer levels of CurLoopLevel
+ OuterEqDir = false;
+ break;
+ }
+ }
+ if (!OuterEqDir || CurLoopLevel > Levels + SameSDLevels) {
+ LLVM_DEBUG(dbgs() << "Safe to fuse with no dependency\n");
+ NumDepSafeFused++;
+ return true;
+ }
+
+ assert(CurLoopLevel > Levels && "Fusion candidates are not separated");
+ unsigned CurDir = DepResult->getDirection(CurLoopLevel, true);
+ if (!(CurDir & Dependence::DVEntry::GT)) {
+ LLVM_DEBUG(dbgs() << "Safe to fuse with backward loop-carried "
+ "dependency\n");
+ NumDepSafeFused++;
+ return true;
+ }
if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor())
LLVM_DEBUG(
diff --git a/llvm/test/Transforms/LoopFusion/backward_loop_carried.ll b/llvm/test/Transforms/LoopFusion/backward_loop_carried.ll
new file mode 100644
index 0000000000000..1822f4fac3387
--- /dev/null
+++ b/llvm/test/Transforms/LoopFusion/backward_loop_carried.ll
@@ -0,0 +1,187 @@
+; RUN: opt -S -passes=loop-fusion -da-disable-delinearization-checks < %s | FileCheck %s
+
+; The two inner loops have no dependency and are allowed to be fused, because
+; their accesses differ in the outer loop levels.
+
+; C Code
+;
+;; for (long int i = 0; i < n; i++) {
+;; for (long int j = 0; j < n; j++) {
+;; for (long int k = 0; k < n; k++)
+;; A[i][j][k] = i;
+;; for (long int k = 0; k < n; k++)
+;; temp = A[i + 3][j + 2][k + 1];
+;; }
+;; }
+
+define void @backward_dep0(i64 %n, ptr %A) nounwind uwtable ssp {
+entry:
+ %cmp10 = icmp sgt i64 %n, 0
+ br i1 %cmp10, label %for.cond1.preheader.preheader, label %for.end26
+
+; CHECK-LABEL: backward_dep
+; CHECK-COUNT-1: for.body{{[0-9]+}}:
+; CHECK-NOT: for.body{{[0-9]+}}:
+
+for.cond1.preheader.preheader: ; preds = %entry
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc24
+ %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %for.cond1.preheader.preheader ]
+ %cmp26 = icmp sgt i64 %n, 0
+ br i1 %cmp26, label %for.cond4.preheader.preheader, label %for.inc24
+
+for.cond4.preheader.preheader: ; preds = %for.cond1.preheader
+ br label %for.cond4.preheader
+
+for.cond4.preheader: ; preds = %for.cond4.preheader.preheader, %for.inc21
+ %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, %for.cond4.preheader.preheader ]
+ %cmp51 = icmp sgt i64 %n, 0
+ br i1 %cmp51, label %for.body6.preheader, label %for.cond10.loopexit
+
+for.body6.preheader: ; preds = %for.cond4.preheader
+ br label %for.body6
+
+for.body6: ; preds = %for.body6.preheader, %for.body6
+ %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ]
+ %arrayidx8 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %k.02
+ store i64 %i.011, ptr %arrayidx8, align 8
+ %inc = add nsw i64 %k.02, 1
+ %exitcond13 = icmp ne i64 %inc, %n
+ br i1 %exitcond13, label %for.body6, label %for.cond10.loopexit.loopexit
+
+for.cond10.loopexit.loopexit: ; preds = %for.body6
+ br label %for.cond10.loopexit
+
+for.cond10.loopexit: ; preds = %for.cond10.loopexit.loopexit, %for.cond4.preheader
+ %cmp113 = icmp sgt i64 %n, 0
+ br i1 %cmp113, label %for.body12.preheader, label %for.inc21
+
+for.body12.preheader: ; preds = %for.cond10.loopexit
+ br label %for.body12
+
+for.body12: ; preds = %for.body12.preheader, %for.body12
+ %k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.body12.preheader ]
+ %add = add nsw i64 %k9.05, 1
+ %add13 = add nsw i64 %j.07, 2
+ %add14 = add nsw i64 %i.011, 3
+ %arrayidx17 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %add14, i64 %add13, i64 %add
+ %0 = load i64, ptr %arrayidx17, align 8
+ %inc19 = add nsw i64 %k9.05, 1
+ %exitcond = icmp ne i64 %inc19, %n
+ br i1 %exitcond, label %for.body12, label %for.inc21.loopexit
+
+for.inc21.loopexit: ; preds = %for.body12
+ br label %for.inc21
+
+for.inc21: ; preds = %for.inc21.loopexit, %for.cond10.loopexit
+ %inc22 = add nsw i64 %j.07, 1
+ %exitcond14 = icmp ne i64 %inc22, %n
+ br i1 %exitcond14, label %for.cond4.preheader, label %for.inc24.loopexit
+
+for.inc24.loopexit: ; preds = %for.inc21
+ br label %for.inc24
+
+for.inc24: ; preds = %for.inc24.loopexit, %for.cond1.preheader
+ %inc25 = add nsw i64 %i.011, 1
+ %exitcond15 = icmp ne i64 %inc25, %n
+ br i1 %exitcond15, label %for.cond1.preheader, label %for.end26.loopexit
+
+for.end26.loopexit: ; preds = %for.inc24
+ br label %for.end26
+
+for.end26: ; preds = %for.end26.loopexit, %entry
+ ret void
+}
+
+; The two inner loops have a backward loop-carried dependency, allowing them
+; to be fused.
+
+; C Code
+;
+;; for (long int i = 0; i < n; i++) {
+;; for (long int j = 0; j < n; j++) {
+;; for (long int k = 0; k < n; k++)
+;; A[i][j][k] = i;
+;; for (long int k = 0; k < n; k++)
+;; temp = A[i][j][k - 1];
+;; }
+;; }
+
+define void @backward_dep1(i64 %n, ptr %A) nounwind uwtable ssp {
+entry:
+ %cmp10 = icmp sgt i64 %n, 0
+ br i1 %cmp10, label %for.cond1.preheader.preheader, label %for.end26
+
+; CHECK-LABEL: backward_dep
+; CHECK-COUNT-1: for.body{{[0-9]+}}:
+; CHECK-NOT: for.body{{[0-9]+}}:
+
+for.cond1.preheader.preheader: ; preds = %entry
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc24
+ %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %for.cond1.preheader.preheader ]
+ %cmp26 = icmp sgt i64 %n, 0
+ br i1 %cmp26, label %for.cond4.preheader.preheader, label %for.inc24
+
+for.cond4.preheader.preheader: ; preds = %for.cond1.preheader
+ br label %for.cond4.preheader
+
+for.cond4.preheader: ; preds = %for.cond4.preheader.preheader, %for.inc21
+ %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, %for.cond4.preheader.preheader ]
+ %cmp51 = icmp sgt i64 %n, 0
+ br i1 %cmp51, label %for.body6.preheader, label %for.cond10.loopexit
+
+for.body6.preheader: ; preds = %for.cond4.preheader
+ br label %for.body6
+
+for.body6: ; preds = %for.body6.preheader, %for.body6
+ %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ]
+ %arrayidx8 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %k.02
+ store i64 %i.011, ptr %arrayidx8, align 8
+ %inc = add nsw i64 %k.02, 1
+ %exitcond13 = icmp ne i64 %inc, %n
+ br i1 %exitcond13, label %for.body6, label %for.cond10.loopexit.loopexit
+
+for.cond10.loopexit.loopexit: ; preds = %for.body6
+ br label %for.cond10.loopexit
+
+for.cond10.loopexit: ; preds = %for.cond10.loopexit.loopexit, %for.cond4.preheader
+ %cmp113 = icmp sgt i64 %n, 0
+ br i1 %cmp113, label %for.body12.preheader, label %for.inc21
+
+for.body12.preheader: ; preds = %for.cond10.loopexit
+ br label %for.body12
+
+for.body12: ; preds = %for.body12.preheader, %for.body12
+ %k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.body12.preheader ]
+ %add = add nsw i64 %k9.05, -1
+ %arrayidx17 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %add
+ %0 = load i64, ptr %arrayidx17, align 8
+ %inc19 = add nsw i64 %k9.05, 1
+ %exitcond = icmp ne i64 %inc19, %n
+ br i1 %exitcond, label %for.body12, label %for.inc21.loopexit
+
+for.inc21.loopexit: ; preds = %for.body12
+ br label %for.inc21
+
+for.inc21: ; preds = %for.inc21.loopexit, %for.cond10.loopexit
+ %inc22 = add nsw i64 %j.07, 1
+ %exitcond14 = icmp ne i64 %inc22, %n
+ br i1 %exitcond14, label %for.cond4.preheader, label %for.inc24.loopexit
+
+for.inc24.loopexit: ; preds = %for.inc21
+ br label %for.inc24
+
+for.inc24: ; preds = %for.inc24.loopexit, %for.cond1.preheader
+ %inc25 = add nsw i64 %i.011, 1
+ %exitcond15 = icmp ne i64 %inc25, %n
+ br i1 %exitcond15, label %for.cond1.preheader, label %for.end26.loopexit
+
+for.end26.loopexit: ; preds = %for.inc24
+ br label %for.end26
+
+for.end26: ; preds = %for.end26.loopexit, %entry
+ ret void
+}
\ No newline at end of file
diff --git a/llvm/test/Transforms/LoopFusion/simple.ll b/llvm/test/Transforms/LoopFusion/simple.ll
index d63890df14461..54556eb98ad80 100644
--- a/llvm/test/Transforms/LoopFusion/simple.ll
+++ b/llvm/test/Transforms/LoopFusion/simple.ll
@@ -298,42 +298,51 @@ bb23: ; preds = %bb17, %bb
ret void
}
+; void forward_dep(int *arg) {
+; for (int i = 0; i < 100; i++) {
+; int tmp = i - 3;
+; int val = tmp * (i + 3) % i;
+; arg[i] = val;
+; }
+;
+; for (int j = 0; j < 100; j++) {
+; int val = arg[j - 3];
+; arg[j] = val * 3;
+; }
+; }
+;
define void @forward_dep(ptr noalias %arg) {
; CHECK-LABEL: @forward_dep(
-; CHECK-NEXT: bb:
-; CHECK-NEXT: br label [[BB7:%.*]]
+; CHECK-NEXT: [[BB:.*]]:
+; CHECK-NEXT: br label %[[BB7:.*]]
; CHECK: bb7:
-; CHECK-NEXT: [[DOT013:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB14:%.*]] ]
-; CHECK-NEXT: [[INDVARS_IV22:%.*]] = phi i64 [ 0, [[BB]] ], [ [[INDVARS_IV_NEXT3:%.*]], [[BB14]] ]
+; CHECK-NEXT: [[DOT013:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP15:%.*]], %[[BB25:.*]] ]
+; CHECK-NEXT: [[INDVARS_IV22:%.*]] = phi i64 [ 0, %[[BB]] ], [ [[INDVARS_IV_NEXT3:%.*]], %[[BB25]] ]
+; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[BB25]] ], [ 0, %[[BB]] ]
; CHECK-NEXT: [[TMP:%.*]] = add nsw i32 [[DOT013]], -3
; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[INDVARS_IV22]], 3
; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32
; CHECK-NEXT: [[TMP10:%.*]] = mul nsw i32 [[TMP]], [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV22]] to i32
; CHECK-NEXT: [[TMP12:%.*]] = srem i32 [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARG:%.*]], i64 [[INDVARS_IV22]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[INDVARS_IV22]]
; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP13]], align 4
-; CHECK-NEXT: br label [[BB14]]
+; CHECK-NEXT: br label %[[BB14:.*]]
; CHECK: bb14:
-; CHECK-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV22]], 1
-; CHECK-NEXT: [[TMP15]] = add nuw nsw i32 [[DOT013]], 1
-; CHECK-NEXT: [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], 100
-; CHECK-NEXT: br i1 [[EXITCOND4]], label [[BB7]], label [[BB19_PREHEADER:%.*]]
-; CHECK: bb19.preheader:
-; CHECK-NEXT: br label [[BB19:%.*]]
-; CHECK: bb19:
-; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[BB25:%.*]] ], [ 0, [[BB19_PREHEADER]] ]
; CHECK-NEXT: [[TMP20:%.*]] = add nsw i64 [[INDVARS_IV1]], -3
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
; CHECK-NEXT: [[TMP23:%.*]] = mul nsw i32 [[TMP22]], 3
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[INDVARS_IV1]]
; CHECK-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
-; CHECK-NEXT: br label [[BB25]]
+; CHECK-NEXT: br label %[[BB25]]
; CHECK: bb25:
+; CHECK-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV22]], 1
+; CHECK-NEXT: [[TMP15]] = add nuw nsw i32 [[DOT013]], 1
+; CHECK-NEXT: [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], 100
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 100
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[BB19]], label [[BB26:%.*]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[BB7]], label %[[BB26:.*]]
; CHECK: bb26:
; CHECK-NEXT: ret void
;
More information about the llvm-commits
mailing list