[llvm] [Pipeliner] Fix Phi node dependency calculation (PR #160056)

Mon Sep 22 02:27:35 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-backend-hexagon

Author: Santanu Das (quic-santdas)

<details>
<summary>Changes</summary>

To calculate RecMII, the pipeliner finds all the loop-carried dependences in the loop, which includes those through phi nodes. While calculating these dependences, the pipeliner encounters a corner case where the definitions of the phis come from the same source, but not from a single instruction.

In this particular case, there is an overlap of the definitions but the source is different. Adding a check for overlap between the phi node definitions fixes the problem.

---
Full diff: https://github.com/llvm/llvm-project/pull/160056.diff


2 Files Affected:

- (modified) llvm/lib/CodeGen/MachinePipeliner.cpp (+17-1) 
- (added) llvm/test/CodeGen/Hexagon/swp-overlap-dep.ll (+125) 


``````````diff

diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 3a9651c5cee04..4b9e3fda4ac08 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -3009,6 +3009,21 @@ bool SwingSchedulerDAG::mayOverlapInLaterIter(
   if (OffsetBIsScalable || OffsetOIsScalable)
     return true;
 
+  // Check if the definitions of the two MIs have a chance of
+  // having the same value, but not from the same parent instruction.
+  // This is possible if the values come via phi nodes. So check
+  // for the parents of phi nodes.
+  auto doesOverlap = [&](MachineInstr* MI, MachineInstr* MI2) {
+    if (MI->isPHI()) {
+      for (auto MO : MI->operands()) {
+        if (!MO.isReg()) continue;
+        if (MI2 == MRI.getVRegDef(MO.getReg()))
+          return true;
+      }
+    }
+    return false;
+  };
+
   if (!BaseOpB->isIdenticalTo(*BaseOpO)) {
     // Pass cases with different base operands but same initial values.
     // Typically for when pre/post increment is used.
@@ -3034,7 +3049,8 @@ bool SwingSchedulerDAG::mayOverlapInLaterIter(
     MachineInstr *InitDefO = MRI.getVRegDef(InitValO);
 
     if (!InitDefB->isIdenticalTo(*InitDefO))
-      return true;
+      if (!doesOverlap(InitDefB, InitDefO) && !doesOverlap(InitDefO, InitDefB))
+        return true;
   }
 
   LocationSize AccessSizeB = (*BaseMI->memoperands_begin())->getSize();
diff --git a/llvm/test/CodeGen/Hexagon/swp-overlap-dep.ll b/llvm/test/CodeGen/Hexagon/swp-overlap-dep.ll
new file mode 100644
index 0000000000000..3296bfc965218
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-overlap-dep.ll
@@ -0,0 +1,125 @@
+; Test passes if the inner loop consists of 5 packets.
+; This is due to the fact that the pipeliner is able
+; to create a schedule with II=5
+
+; RUN: llc -O3 -mtriple=hexagon -mv71t < %s | FileCheck %s
+
+; CHECK: loop0(.LBB0_[[LOOP:.]],
+; CHECK: .LBB0_[[LOOP]]:
+; CHECK: {
+; CHECK: }
+; CHECK: {
+; CHECK: }
+; CHECK: {
+; CHECK: }
+; CHECK: {
+; CHECK: }
+; CHECK: {
+; CHECK: }{{[ \t]*}}:endloop0
+
+define dso_local void @foo(ptr noundef readonly captures(none) %arg, ptr noundef readonly captures(none) %arg1, ptr noundef captures(none) %arg2, i32 noundef %arg3, i32 noundef %arg4, ptr noalias noundef captures(none) %arg5) local_unnamed_addr {
+bb:
+  %icmp = icmp sgt i32 %arg3, 0
+  br i1 %icmp, label %bb6, label %bb70
+
+bb6:                                              ; preds = %bb
+  %icmp7 = icmp sgt i32 %arg4, 0
+  br i1 %icmp7, label %bb8, label %bb57
+
+bb8:                                              ; preds = %bb53, %bb6
+  %phi = phi ptr [ %getelementptr56, %bb53 ], [ %arg2, %bb6 ]
+  %phi9 = phi ptr [ %getelementptr32, %bb53 ], [ %arg1, %bb6 ]
+  %phi10 = phi i32 [ %add54, %bb53 ], [ 0, %bb6 ]
+  %phi11 = phi ptr [ %arg5, %bb53 ], [ %arg, %bb6 ]
+  %load = load i32, ptr %phi, align 4
+  %getelementptr = getelementptr inbounds nuw i8, ptr %phi, i32 4
+  %load12 = load i32, ptr %getelementptr, align 4
+  %zext = zext i32 %load12 to i64
+  %shl = shl nuw i64 %zext, 32
+  %zext13 = zext i32 %load to i64
+  %or = or disjoint i64 %shl, %zext13
+  %getelementptr14 = getelementptr inbounds nuw i8, ptr %phi9, i32 4
+  %load15 = load i32, ptr %getelementptr14, align 4
+  %getelementptr16 = getelementptr inbounds nuw i8, ptr %phi9, i32 8
+  %load17 = load i32, ptr %getelementptr16, align 4
+  %sub = sub nsw i32 0, %load15
+  %zext18 = zext i32 %load17 to i64
+  %shl19 = shl nuw i64 %zext18, 32
+  %zext20 = zext i32 %sub to i64
+  %or21 = or disjoint i64 %shl19, %zext20
+  %getelementptr22 = getelementptr inbounds nuw i8, ptr %phi9, i32 12
+  %load23 = load i32, ptr %getelementptr22, align 4
+  %getelementptr24 = getelementptr inbounds nuw i8, ptr %phi9, i32 16
+  %load25 = load i32, ptr %getelementptr24, align 4
+  %getelementptr26 = getelementptr inbounds nuw i8, ptr %phi9, i32 20
+  %load27 = load i32, ptr %getelementptr26, align 4
+  %zext28 = zext i32 %load27 to i64
+  %shl29 = shl nuw i64 %zext28, 32
+  %zext30 = zext i32 %load25 to i64
+  %or31 = or disjoint i64 %shl29, %zext30
+  %getelementptr32 = getelementptr i8, ptr %phi9, i32 24
+  br label %bb33
+
+bb33:                                             ; preds = %bb33, %bb8
+  %phi34 = phi ptr [ %arg5, %bb8 ], [ %getelementptr52, %bb33 ]
+  %phi35 = phi i32 [ 0, %bb8 ], [ %add, %bb33 ]
+  %phi36 = phi i32 [ %load, %bb8 ], [ %call42, %bb33 ]
+  %phi37 = phi i64 [ %or, %bb8 ], [ %or50, %bb33 ]
+  %phi38 = phi ptr [ %phi11, %bb8 ], [ %getelementptr39, %bb33 ]
+  %getelementptr39 = getelementptr inbounds nuw i8, ptr %phi38, i32 4
+  %load40 = load i32, ptr %phi38, align 4
+  %sext = sext i32 %load40 to i64
+  %shl41 = shl nsw i64 %sext, 25
+  %call = tail call i64 @llvm.hexagon.M7.dcmpyrw.acc(i64 %shl41, i64 %or21, i64 %phi37)
+  %ashr = ashr i64 %call, 28
+  %call42 = tail call i32 @llvm.hexagon.A2.sat(i64 %ashr)
+  %call43 = tail call i64 @llvm.hexagon.M7.dcmpyrwc(i64 %or31, i64 %phi37)
+  %call44 = tail call i64 @llvm.hexagon.M2.dpmpyss.acc.s0(i64 %call43, i32 %load23, i32 %call42)
+  %ashr45 = ashr i64 %call44, 25
+  %call46 = tail call i32 @llvm.hexagon.A2.sat(i64 %ashr45)
+  store i32 %call46, ptr %phi34, align 4
+  %zext47 = zext i32 %phi36 to i64
+  %shl48 = shl nuw i64 %zext47, 32
+  %zext49 = zext i32 %call42 to i64
+  %or50 = or disjoint i64 %shl48, %zext49
+  %add = add nuw nsw i32 %phi35, 1
+  %icmp51 = icmp eq i32 %add, %arg4
+  %getelementptr52 = getelementptr i8, ptr %phi34, i32 4
+  br i1 %icmp51, label %bb53, label %bb33, !llvm.loop !1
+
+bb53:                                             ; preds = %bb33
+  store i64 %or50, ptr %phi, align 8
+  %add54 = add nuw nsw i32 %phi10, 1
+  %icmp55 = icmp eq i32 %add54, %arg3
+  %getelementptr56 = getelementptr i8, ptr %phi, i32 8
+  br i1 %icmp55, label %bb70, label %bb8, !llvm.loop !3
+
+bb57:                                             ; preds = %bb57, %bb6
+  %phi58 = phi ptr [ %getelementptr69, %bb57 ], [ %arg2, %bb6 ]
+  %phi59 = phi i32 [ %add67, %bb57 ], [ 0, %bb6 ]
+  %load60 = load i32, ptr %phi58, align 4
+  %getelementptr61 = getelementptr inbounds nuw i8, ptr %phi58, i32 4
+  %load62 = load i32, ptr %getelementptr61, align 4
+  %zext63 = zext i32 %load62 to i64
+  %shl64 = shl nuw i64 %zext63, 32
+  %zext65 = zext i32 %load60 to i64
+  %or66 = or disjoint i64 %shl64, %zext65
+  store i64 %or66, ptr %phi58, align 8
+  %add67 = add nuw nsw i32 %phi59, 1
+  %icmp68 = icmp eq i32 %add67, %arg3
+  %getelementptr69 = getelementptr i8, ptr %phi58, i32 8
+  br i1 %icmp68, label %bb70, label %bb57, !llvm.loop !4
+
+bb70:                                             ; preds = %bb57, %bb53, %bb
+  ret void
+}
+
+declare i64 @llvm.hexagon.M7.dcmpyrw.acc(i64, i64, i64)
+declare i32 @llvm.hexagon.A2.sat(i64)
+declare i64 @llvm.hexagon.M7.dcmpyrwc(i64, i64)
+declare i64 @llvm.hexagon.M2.dpmpyss.acc.s0(i64, i32, i32)
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
+!3 = distinct !{!3, !2}
+!4 = distinct !{!4, !2}

``````````

</details>


https://github.com/llvm/llvm-project/pull/160056