[llvm] [LoopFusion] Fix sink instructions (PR #147501)

Tue Jul 8 03:40:49 PDT 2025

https://github.com/madhur13490 created https://github.com/llvm/llvm-project/pull/147501

If we have instructions in second loop's preheader which can be sunk, we should also be adjusting
PHI nodes to receive values from the new loop's latch block.

Fixes #128600

>From 5c7daeeda142ca7a71dc9e403ac5dea452b82f7b Mon Sep 17 00:00:00 2001
From: Madhur Amilkanthwar <madhura at nvidia.com>
Date: Tue, 8 Jul 2025 01:48:33 -0700
Subject: [PATCH] [LoopFusion] Fix sink instructions

If we have instructions in second loop's preheader
which can be sunk, we should also be adjusting
PHI nodes to receive values from the new loop's latch block.

Fixes #128600
---
 llvm/lib/Transforms/Scalar/LoopFuse.cpp       | 34 ++++++--
 .../Transforms/LoopFusion/sunk-phi-nodes.ll   | 86 +++++++++++++++++++
 2 files changed, 115 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index d6bd92d520e28..6e1556a4d90b4 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -988,8 +988,8 @@ struct LoopFuser {
 
             // If it is not safe to hoist/sink all instructions in the
             // pre-header, we cannot fuse these loops.
-            if (!collectMovablePreheaderInsts(*FC0, *FC1, SafeToHoist,
-                                              SafeToSink)) {
+            if (!collectAndFixMovablePreheaderInsts(*FC0, *FC1, SafeToHoist,
+                                                    SafeToSink)) {
               LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in "
                                    "Fusion Candidate Pre-header.\n"
                                 << "Not Fusing.\n");
@@ -1033,8 +1033,8 @@ struct LoopFuser {
                                                FuseCounter);
 
           FusionCandidate FusedCand(
-              performFusion((Peel ? FC0Copy : *FC0), *FC1), DT, &PDT, ORE,
-              FC0Copy.PP);
+              performFusion((Peel ? FC0Copy : *FC0), *FC1, SafeToSink), DT,
+              &PDT, ORE, FC0Copy.PP);
           FusedCand.verify();
           assert(FusedCand.isEligibleForFusion(SE) &&
                  "Fused candidate should be eligible for fusion!");
@@ -1176,9 +1176,31 @@ struct LoopFuser {
     return true;
   }
 
+  void fixPHINodes(SmallVector<Instruction *, 4> &SafeToSink,
+                   const FusionCandidate &FC0,
+                   const FusionCandidate &FC1) const {
+    // Iterate over SafeToSink instructions and update PHI nodes
+    // to take values from the latch block of FC0 if they are taking
+    // from the latch block of FC1.
+    for (Instruction *Inst : SafeToSink) {
+      LLVM_DEBUG(dbgs() << "UPDATING: Instruction: " << *Inst << "\n");
+      // Continue if the instruction is not a PHI node.
+      if (!isa<PHINode>(Inst))
+        continue;
+      PHINode *Phi = dyn_cast<PHINode>(Inst);
+      LLVM_DEBUG(dbgs() << "UPDATING: PHI node: " << *Phi << "\n");
+      for (unsigned I = 0; I < Phi->getNumIncomingValues(); I++) {
+        if (Phi->getIncomingBlock(I) != FC0.Latch)
+          continue;
+        assert(FC1.Latch && "FC1 latch is not set");
+        Phi->setIncomingBlock(I, FC1.Latch);
+      }
+    }
+  }
+
   /// Collect instructions in the \p FC1 Preheader that can be hoisted
   /// to the \p FC0 Preheader or sunk into the \p FC1 Body
-  bool collectMovablePreheaderInsts(
+  bool collectAndFixMovablePreheaderInsts(
       const FusionCandidate &FC0, const FusionCandidate &FC1,
       SmallVector<Instruction *, 4> &SafeToHoist,
       SmallVector<Instruction *, 4> &SafeToSink) const {
@@ -1226,6 +1248,8 @@ struct LoopFuser {
     }
     LLVM_DEBUG(
         dbgs() << "All preheader instructions could be sunk or hoisted!\n");
+
+    fixPHINodes(SafeToSink, FC0, FC1);
     return true;
   }
 
diff --git a/llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll b/llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll
new file mode 100644
index 0000000000000..3c72df8ae19fb
--- /dev/null
+++ b/llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=mem2reg,loop-rotate,loop-fusion < %s 2>&1 | FileCheck %s
+define i32 @main() {
+; CHECK-LABEL: define i32 @main() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[SUM1_02:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[FOR_INC6:.*]] ]
+; CHECK-NEXT:    [[I_01:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_INC6]] ]
+; CHECK-NEXT:    [[I1_04:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC7:%.*]], %[[FOR_INC6]] ]
+; CHECK-NEXT:    [[SUM2_03:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD5:%.*]], %[[FOR_INC6]] ]
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[SUM1_02]], [[I_01]]
+; CHECK-NEXT:    br label %[[FOR_INC:.*]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[I1_04]], [[I1_04]]
+; CHECK-NEXT:    [[ADD5]] = add nsw i32 [[SUM2_03]], [[MUL]]
+; CHECK-NEXT:    br label %[[FOR_INC6]]
+; CHECK:       [[FOR_INC6]]:
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_01]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], 10
+; CHECK-NEXT:    [[INC7]] = add nsw i32 [[I1_04]], 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp slt i32 [[INC7]], 10
+; CHECK-NEXT:    br i1 [[CMP3]], label %[[FOR_BODY]], label %[[FOR_END8:.*]]
+; CHECK:       [[FOR_END8]]:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %retval = alloca i32, align 4
+  %sum1 = alloca i32, align 4
+  %sum2 = alloca i32, align 4
+  %i = alloca i32, align 4
+  %i1 = alloca i32, align 4
+  store i32 0, ptr %retval, align 4
+  store i32 0, ptr %sum1, align 4
+  store i32 0, ptr %sum2, align 4
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:
+  %0 = load i32, ptr %i, align 4
+  %cmp = icmp slt i32 %0, 10
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  %1 = load i32, ptr %i, align 4
+  %2 = load i32, ptr %sum1, align 4
+  %add = add nsw i32 %2, %1
+  store i32 %add, ptr %sum1, align 4
+  br label %for.inc
+
+for.inc:
+  %3 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %3, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond
+
+for.end:
+  store i32 0, ptr %i1, align 4
+  br label %for.cond2
+
+for.cond2:
+  %4 = load i32, ptr %i1, align 4
+  %cmp3 = icmp slt i32 %4, 10
+  br i1 %cmp3, label %for.body4, label %for.end8
+
+for.body4:
+  %5 = load i32, ptr %i1, align 4
+  %6 = load i32, ptr %i1, align 4
+  %mul = mul nsw i32 %5, %6
+  %7 = load i32, ptr %sum2, align 4
+  %add5 = add nsw i32 %7, %mul
+  store i32 %add5, ptr %sum2, align 4
+  br label %for.inc6
+
+for.inc6:
+  %8 = load i32, ptr %i1, align 4
+  %inc7 = add nsw i32 %8, 1
+  store i32 %inc7, ptr %i1, align 4
+  br label %for.cond2
+
+for.end8:
+  %9 = load i32, ptr %sum1, align 4
+  %10 = load i32, ptr %sum2, align 4
+  ret i32 0
+}
+