[llvm] [LoopFusion] Fix sink instructions (PR #147501)

Tue Jul 8 03:41:26 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-llvm-transforms

Author: Madhur Amilkanthwar (madhur13490)

<details>
<summary>Changes</summary>

If the second loop's preheader contains instructions that can be sunk, we should also adjust
the PHI nodes to receive values from the new loop's latch block.

Fixes #128600

---
Full diff: https://github.com/llvm/llvm-project/pull/147501.diff


2 Files Affected:

- (modified) llvm/lib/Transforms/Scalar/LoopFuse.cpp (+29-5) 
- (added) llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll (+86) 


``````````diff

diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index d6bd92d520e28..6e1556a4d90b4 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -988,8 +988,8 @@ struct LoopFuser {
 
             // If it is not safe to hoist/sink all instructions in the
             // pre-header, we cannot fuse these loops.
-            if (!collectMovablePreheaderInsts(*FC0, *FC1, SafeToHoist,
-                                              SafeToSink)) {
+            if (!collectAndFixMovablePreheaderInsts(*FC0, *FC1, SafeToHoist,
+                                                    SafeToSink)) {
               LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in "
                                    "Fusion Candidate Pre-header.\n"
                                 << "Not Fusing.\n");
@@ -1033,8 +1033,8 @@ struct LoopFuser {
                                                FuseCounter);
 
           FusionCandidate FusedCand(
-              performFusion((Peel ? FC0Copy : *FC0), *FC1), DT, &PDT, ORE,
-              FC0Copy.PP);
+              performFusion((Peel ? FC0Copy : *FC0), *FC1, SafeToSink), DT,
+              &PDT, ORE, FC0Copy.PP);
           FusedCand.verify();
           assert(FusedCand.isEligibleForFusion(SE) &&
                  "Fused candidate should be eligible for fusion!");
@@ -1176,9 +1176,31 @@ struct LoopFuser {
     return true;
   }
 
+  void fixPHINodes(SmallVector<Instruction *, 4> &SafeToSink,
+                   const FusionCandidate &FC0,
+                   const FusionCandidate &FC1) const {
+    // Iterate over the SafeToSink instructions and retarget PHI nodes:
+    // every incoming edge that currently comes from the latch block of FC0
+    // is changed to come from the latch block of FC1.
+    for (Instruction *Inst : SafeToSink) {
+      LLVM_DEBUG(dbgs() << "UPDATING: Instruction: " << *Inst << "\n");
+      // Only PHI nodes carry incoming blocks; skip every other instruction.
+      if (!isa<PHINode>(Inst))
+        continue;
+      PHINode *Phi = dyn_cast<PHINode>(Inst);
+      LLVM_DEBUG(dbgs() << "UPDATING: PHI node: " << *Phi << "\n");
+      for (unsigned I = 0; I < Phi->getNumIncomingValues(); I++) {
+        if (Phi->getIncomingBlock(I) != FC0.Latch)
+          continue;
+        assert(FC1.Latch && "FC1 latch is not set");
+        Phi->setIncomingBlock(I, FC1.Latch);
+      }
+    }
+  }
+
   /// Collect instructions in the \p FC1 Preheader that can be hoisted
   /// to the \p FC0 Preheader or sunk into the \p FC1 Body
-  bool collectMovablePreheaderInsts(
+  bool collectAndFixMovablePreheaderInsts(
       const FusionCandidate &FC0, const FusionCandidate &FC1,
       SmallVector<Instruction *, 4> &SafeToHoist,
       SmallVector<Instruction *, 4> &SafeToSink) const {
@@ -1226,6 +1248,8 @@ struct LoopFuser {
     }
     LLVM_DEBUG(
         dbgs() << "All preheader instructions could be sunk or hoisted!\n");
+
+    fixPHINodes(SafeToSink, FC0, FC1);
     return true;
   }
 
diff --git a/llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll b/llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll
new file mode 100644
index 0000000000000..3c72df8ae19fb
--- /dev/null
+++ b/llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=mem2reg,loop-rotate,loop-fusion < %s 2>&1 | FileCheck %s
+define i32 @main() {
+; CHECK-LABEL: define i32 @main() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[SUM1_02:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[FOR_INC6:.*]] ]
+; CHECK-NEXT:    [[I_01:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_INC6]] ]
+; CHECK-NEXT:    [[I1_04:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC7:%.*]], %[[FOR_INC6]] ]
+; CHECK-NEXT:    [[SUM2_03:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD5:%.*]], %[[FOR_INC6]] ]
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[SUM1_02]], [[I_01]]
+; CHECK-NEXT:    br label %[[FOR_INC:.*]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[I1_04]], [[I1_04]]
+; CHECK-NEXT:    [[ADD5]] = add nsw i32 [[SUM2_03]], [[MUL]]
+; CHECK-NEXT:    br label %[[FOR_INC6]]
+; CHECK:       [[FOR_INC6]]:
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_01]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], 10
+; CHECK-NEXT:    [[INC7]] = add nsw i32 [[I1_04]], 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp slt i32 [[INC7]], 10
+; CHECK-NEXT:    br i1 [[CMP3]], label %[[FOR_BODY]], label %[[FOR_END8:.*]]
+; CHECK:       [[FOR_END8]]:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %retval = alloca i32, align 4
+  %sum1 = alloca i32, align 4
+  %sum2 = alloca i32, align 4
+  %i = alloca i32, align 4
+  %i1 = alloca i32, align 4
+  store i32 0, ptr %retval, align 4
+  store i32 0, ptr %sum1, align 4
+  store i32 0, ptr %sum2, align 4
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:
+  %0 = load i32, ptr %i, align 4
+  %cmp = icmp slt i32 %0, 10
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  %1 = load i32, ptr %i, align 4
+  %2 = load i32, ptr %sum1, align 4
+  %add = add nsw i32 %2, %1
+  store i32 %add, ptr %sum1, align 4
+  br label %for.inc
+
+for.inc:
+  %3 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %3, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond
+
+for.end:
+  store i32 0, ptr %i1, align 4
+  br label %for.cond2
+
+for.cond2:
+  %4 = load i32, ptr %i1, align 4
+  %cmp3 = icmp slt i32 %4, 10
+  br i1 %cmp3, label %for.body4, label %for.end8
+
+for.body4:
+  %5 = load i32, ptr %i1, align 4
+  %6 = load i32, ptr %i1, align 4
+  %mul = mul nsw i32 %5, %6
+  %7 = load i32, ptr %sum2, align 4
+  %add5 = add nsw i32 %7, %mul
+  store i32 %add5, ptr %sum2, align 4
+  br label %for.inc6
+
+for.inc6:
+  %8 = load i32, ptr %i1, align 4
+  %inc7 = add nsw i32 %8, 1
+  store i32 %inc7, ptr %i1, align 4
+  br label %for.cond2
+
+for.end8:
+  %9 = load i32, ptr %sum1, align 4
+  %10 = load i32, ptr %sum2, align 4
+  ret i32 0
+}
+

``````````

</details>


https://github.com/llvm/llvm-project/pull/147501