[llvm] [LoopFusion] Extending SIV to handle separate loops (PR #146383)

Alireza Torabian via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 22 09:03:20 PDT 2025


https://github.com/1997alireza updated https://github.com/llvm/llvm-project/pull/146383

>From f463d8ee5a88ddc83f4056463aeae6640e091ca5 Mon Sep 17 00:00:00 2001
From: a00917109 <alireza.torabian at huawei.com>
Date: Thu, 18 Sep 2025 18:46:06 -0400
Subject: [PATCH] [LoopFusion] Detecting loop-carried dependencies using DA
 info

Loop fusion pass will uses the information provided by DA to
detect loop-carried dependencies and fuse the loops if it is legal.
---
 llvm/lib/Transforms/Scalar/LoopFuse.cpp       |  29 +++
 .../LoopFusion/backward_loop_carried.ll       | 187 ++++++++++++++++++
 llvm/test/Transforms/LoopFusion/simple.ll     |  41 ++--
 3 files changed, 241 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopFusion/backward_loop_carried.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index b5eb647a042b9..788ba0beed88d 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -100,6 +100,8 @@ STATISTIC(OnlySecondCandidateIsGuarded,
           "The second candidate is guarded while the first one is not");
 STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions.");
 STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions.");
+STATISTIC(NumDepSafeFused, "Number of fused loops with dependencies "
+                           "proven safe based on the dependence direction");
 
 enum FusionDependenceAnalysisChoice {
   FUSION_DEPENDENCE_ANALYSIS_SCEV,
@@ -1371,6 +1373,33 @@ struct LoopFuser {
                           << "\n");
       }
 #endif
+      unsigned Levels = DepResult->getLevels();
+      unsigned SameSDLevels = DepResult->getSameSDLevels();
+      unsigned CurLoopLevel = FC0.L->getLoopDepth();
+
+      bool OuterEqDir = true;
+      for (unsigned II = 1; II <= std::min(CurLoopLevel - 1, Levels); ++II) {
+        unsigned Direction = DepResult->getDirection(II, II > Levels);
+        if (!(Direction & Dependence::DVEntry::EQ)) {
+          // Different accesses in the outer levels of CurLoopLevel
+          OuterEqDir = false;
+          break;
+        }
+      }
+      if (!OuterEqDir || CurLoopLevel > Levels + SameSDLevels) {
+        LLVM_DEBUG(dbgs() << "Safe to fuse with no dependency\n");
+        NumDepSafeFused++;
+        return true;
+      }
+
+      assert(CurLoopLevel > Levels && "Fusion candidates are not separated");
+      unsigned CurDir = DepResult->getDirection(CurLoopLevel, true);
+      if (!(CurDir & Dependence::DVEntry::GT)) {
+        LLVM_DEBUG(dbgs() << "Safe to fuse with backward loop-carried "
+                             "dependency\n");
+        NumDepSafeFused++;
+        return true;
+      }
 
       if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor())
         LLVM_DEBUG(
diff --git a/llvm/test/Transforms/LoopFusion/backward_loop_carried.ll b/llvm/test/Transforms/LoopFusion/backward_loop_carried.ll
new file mode 100644
index 0000000000000..1822f4fac3387
--- /dev/null
+++ b/llvm/test/Transforms/LoopFusion/backward_loop_carried.ll
@@ -0,0 +1,187 @@
+; RUN: opt -S -passes=loop-fusion -da-disable-delinearization-checks < %s | FileCheck %s
+
+; The two inner loops have no dependency and are allowed to be fused as in the
+; outer loops, different levels are accessed to.
+
+; C Code
+;
+;;  for (long int i = 0; i < n; i++) {
+;;    for (long int j = 0; j < n; j++) {
+;;      for (long int k = 0; k < n; k++)
+;;        A[i][j][k] = i;
+;;      for (long int k = 0; k < n; k++)
+;;        temp = A[i + 3][j + 2][k + 1];
+;;    }
+;;  }
+
+define void @backward_dep0(i64 %n, ptr %A) nounwind uwtable ssp {
+entry:
+  %cmp10 = icmp sgt i64 %n, 0
+  br i1 %cmp10, label %for.cond1.preheader.preheader, label %for.end26
+
+; CHECK-LABEL: backward_dep
+; CHECK-COUNT-1: for.body{{[0-9]+}}:
+; CHECK-NOT:     for.body{{[0-9]+}}:
+
+for.cond1.preheader.preheader:                    ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.inc24
+  %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %for.cond1.preheader.preheader ]
+  %cmp26 = icmp sgt i64 %n, 0
+  br i1 %cmp26, label %for.cond4.preheader.preheader, label %for.inc24
+
+for.cond4.preheader.preheader:                    ; preds = %for.cond1.preheader
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.cond4.preheader.preheader, %for.inc21
+  %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, %for.cond4.preheader.preheader ]
+  %cmp51 = icmp sgt i64 %n, 0
+  br i1 %cmp51, label %for.body6.preheader, label %for.cond10.loopexit
+
+for.body6.preheader:                              ; preds = %for.cond4.preheader
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.body6.preheader, %for.body6
+  %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ]
+  %arrayidx8 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %k.02
+  store i64 %i.011, ptr %arrayidx8, align 8
+  %inc = add nsw i64 %k.02, 1
+  %exitcond13 = icmp ne i64 %inc, %n
+  br i1 %exitcond13, label %for.body6, label %for.cond10.loopexit.loopexit
+
+for.cond10.loopexit.loopexit:                     ; preds = %for.body6
+  br label %for.cond10.loopexit
+
+for.cond10.loopexit:                              ; preds = %for.cond10.loopexit.loopexit, %for.cond4.preheader
+  %cmp113 = icmp sgt i64 %n, 0
+  br i1 %cmp113, label %for.body12.preheader, label %for.inc21
+
+for.body12.preheader:                             ; preds = %for.cond10.loopexit
+  br label %for.body12
+
+for.body12:                                       ; preds = %for.body12.preheader, %for.body12
+  %k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.body12.preheader ]
+  %add = add nsw i64 %k9.05, 1
+  %add13 = add nsw i64 %j.07, 2
+  %add14 = add nsw i64 %i.011, 3
+  %arrayidx17 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %add14, i64 %add13, i64 %add
+  %0 = load i64, ptr %arrayidx17, align 8
+  %inc19 = add nsw i64 %k9.05, 1
+  %exitcond = icmp ne i64 %inc19, %n
+  br i1 %exitcond, label %for.body12, label %for.inc21.loopexit
+
+for.inc21.loopexit:                               ; preds = %for.body12
+  br label %for.inc21
+
+for.inc21:                                        ; preds = %for.inc21.loopexit, %for.cond10.loopexit
+  %inc22 = add nsw i64 %j.07, 1
+  %exitcond14 = icmp ne i64 %inc22, %n
+  br i1 %exitcond14, label %for.cond4.preheader, label %for.inc24.loopexit
+
+for.inc24.loopexit:                               ; preds = %for.inc21
+  br label %for.inc24
+
+for.inc24:                                        ; preds = %for.inc24.loopexit, %for.cond1.preheader
+  %inc25 = add nsw i64 %i.011, 1
+  %exitcond15 = icmp ne i64 %inc25, %n
+  br i1 %exitcond15, label %for.cond1.preheader, label %for.end26.loopexit
+
+for.end26.loopexit:                               ; preds = %for.inc24
+  br label %for.end26
+
+for.end26:                                        ; preds = %for.end26.loopexit, %entry
+  ret void
+}
+
+; The two inner loops have a backward loop-carried dependency, allowing them
+; to be fused.
+
+; C Code
+;
+;;  for (long int i = 0; i < n; i++) {
+;;    for (long int j = 0; j < n; j++) {
+;;      for (long int k = 0; k < n; k++)
+;;        A[i][j][k] = i;
+;;      for (long int k = 0; k < n; k++)
+;;        temp = A[i][j][k - 1];
+;;    }
+;;  }
+
+define void @backward_dep1(i64 %n, ptr %A) nounwind uwtable ssp {
+entry:
+  %cmp10 = icmp sgt i64 %n, 0
+  br i1 %cmp10, label %for.cond1.preheader.preheader, label %for.end26
+
+; CHECK-LABEL: backward_dep
+; CHECK-COUNT-1: for.body{{[0-9]+}}:
+; CHECK-NOT:     for.body{{[0-9]+}}:
+
+for.cond1.preheader.preheader:                    ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.inc24
+  %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %for.cond1.preheader.preheader ]
+  %cmp26 = icmp sgt i64 %n, 0
+  br i1 %cmp26, label %for.cond4.preheader.preheader, label %for.inc24
+
+for.cond4.preheader.preheader:                    ; preds = %for.cond1.preheader
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.cond4.preheader.preheader, %for.inc21
+  %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, %for.cond4.preheader.preheader ]
+  %cmp51 = icmp sgt i64 %n, 0
+  br i1 %cmp51, label %for.body6.preheader, label %for.cond10.loopexit
+
+for.body6.preheader:                              ; preds = %for.cond4.preheader
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.body6.preheader, %for.body6
+  %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ]
+  %arrayidx8 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %k.02
+  store i64 %i.011, ptr %arrayidx8, align 8
+  %inc = add nsw i64 %k.02, 1
+  %exitcond13 = icmp ne i64 %inc, %n
+  br i1 %exitcond13, label %for.body6, label %for.cond10.loopexit.loopexit
+
+for.cond10.loopexit.loopexit:                     ; preds = %for.body6
+  br label %for.cond10.loopexit
+
+for.cond10.loopexit:                              ; preds = %for.cond10.loopexit.loopexit, %for.cond4.preheader
+  %cmp113 = icmp sgt i64 %n, 0
+  br i1 %cmp113, label %for.body12.preheader, label %for.inc21
+
+for.body12.preheader:                             ; preds = %for.cond10.loopexit
+  br label %for.body12
+
+for.body12:                                       ; preds = %for.body12.preheader, %for.body12
+  %k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.body12.preheader ]
+  %add = add nsw i64 %k9.05, -1
+  %arrayidx17 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %add
+  %0 = load i64, ptr %arrayidx17, align 8
+  %inc19 = add nsw i64 %k9.05, 1
+  %exitcond = icmp ne i64 %inc19, %n
+  br i1 %exitcond, label %for.body12, label %for.inc21.loopexit
+
+for.inc21.loopexit:                               ; preds = %for.body12
+  br label %for.inc21
+
+for.inc21:                                        ; preds = %for.inc21.loopexit, %for.cond10.loopexit
+  %inc22 = add nsw i64 %j.07, 1
+  %exitcond14 = icmp ne i64 %inc22, %n
+  br i1 %exitcond14, label %for.cond4.preheader, label %for.inc24.loopexit
+
+for.inc24.loopexit:                               ; preds = %for.inc21
+  br label %for.inc24
+
+for.inc24:                                        ; preds = %for.inc24.loopexit, %for.cond1.preheader
+  %inc25 = add nsw i64 %i.011, 1
+  %exitcond15 = icmp ne i64 %inc25, %n
+  br i1 %exitcond15, label %for.cond1.preheader, label %for.end26.loopexit
+
+for.end26.loopexit:                               ; preds = %for.inc24
+  br label %for.end26
+
+for.end26:                                        ; preds = %for.end26.loopexit, %entry
+  ret void
+}
\ No newline at end of file
diff --git a/llvm/test/Transforms/LoopFusion/simple.ll b/llvm/test/Transforms/LoopFusion/simple.ll
index d63890df14461..54556eb98ad80 100644
--- a/llvm/test/Transforms/LoopFusion/simple.ll
+++ b/llvm/test/Transforms/LoopFusion/simple.ll
@@ -298,42 +298,51 @@ bb23:                                             ; preds = %bb17, %bb
   ret void
 }
 
+; void forward_dep(int *arg) {
+;     for (int i = 0; i < 100; i++) {
+;         int tmp = i - 3;
+;         int val = tmp * (i + 3) % i;
+;         arg[i] = val;
+;     }
+; 
+;     for (int j = 0; j < 100; j++) {
+;         int val = arg[j - 3];
+;         arg[j] = val * 3;
+;     }
+; }
+;
 define void @forward_dep(ptr noalias %arg) {
 ; CHECK-LABEL: @forward_dep(
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    br label [[BB7:%.*]]
+; CHECK-NEXT:  [[BB:.*]]:
+; CHECK-NEXT:    br label %[[BB7:.*]]
 ; CHECK:       bb7:
-; CHECK-NEXT:    [[DOT013:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB14:%.*]] ]
-; CHECK-NEXT:    [[INDVARS_IV22:%.*]] = phi i64 [ 0, [[BB]] ], [ [[INDVARS_IV_NEXT3:%.*]], [[BB14]] ]
+; CHECK-NEXT:    [[DOT013:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP15:%.*]], %[[BB25:.*]] ]
+; CHECK-NEXT:    [[INDVARS_IV22:%.*]] = phi i64 [ 0, %[[BB]] ], [ [[INDVARS_IV_NEXT3:%.*]], %[[BB25]] ]
+; CHECK-NEXT:    [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[BB25]] ], [ 0, %[[BB]] ]
 ; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[DOT013]], -3
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[INDVARS_IV22]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw i32 [[TMP]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[INDVARS_IV22]] to i32
 ; CHECK-NEXT:    [[TMP12:%.*]] = srem i32 [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARG:%.*]], i64 [[INDVARS_IV22]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[INDVARS_IV22]]
 ; CHECK-NEXT:    store i32 [[TMP12]], ptr [[TMP13]], align 4
-; CHECK-NEXT:    br label [[BB14]]
+; CHECK-NEXT:    br label %[[BB14:.*]]
 ; CHECK:       bb14:
-; CHECK-NEXT:    [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV22]], 1
-; CHECK-NEXT:    [[TMP15]] = add nuw nsw i32 [[DOT013]], 1
-; CHECK-NEXT:    [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], 100
-; CHECK-NEXT:    br i1 [[EXITCOND4]], label [[BB7]], label [[BB19_PREHEADER:%.*]]
-; CHECK:       bb19.preheader:
-; CHECK-NEXT:    br label [[BB19:%.*]]
-; CHECK:       bb19:
-; CHECK-NEXT:    [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[BB25:%.*]] ], [ 0, [[BB19_PREHEADER]] ]
 ; CHECK-NEXT:    [[TMP20:%.*]] = add nsw i64 [[INDVARS_IV1]], -3
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[TMP20]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
 ; CHECK-NEXT:    [[TMP23:%.*]] = mul nsw i32 [[TMP22]], 3
 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[INDVARS_IV1]]
 ; CHECK-NEXT:    store i32 [[TMP23]], ptr [[TMP24]], align 4
-; CHECK-NEXT:    br label [[BB25]]
+; CHECK-NEXT:    br label %[[BB25]]
 ; CHECK:       bb25:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV22]], 1
+; CHECK-NEXT:    [[TMP15]] = add nuw nsw i32 [[DOT013]], 1
+; CHECK-NEXT:    [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], 100
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 100
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[BB19]], label [[BB26:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[BB7]], label %[[BB26:.*]]
 ; CHECK:       bb26:
 ; CHECK-NEXT:    ret void
 ;



More information about the llvm-commits mailing list