[llvm] [Uniformity] Fixed control-div early stop (PR #139667)

Mon May 12 22:27:26 PDT 2025

llvmbot wrote:



@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-llvm-analysis

Author: Junjie Gu (jgu222)

<details>
<summary>Changes</summary>

Control-divergence finds joins by propagating labels from the divergent control branch. The code that checks the early stop for propagation is not correct in some cases.

This change fixes this issue by stopping at the post-dominator of the successors of the divergent branch.

https://github.com/llvm/llvm-project/issues/137277

---
Full diff: https://github.com/llvm/llvm-project/pull/139667.diff


4 Files Affected:

- (modified) llvm/include/llvm/ADT/GenericUniformityImpl.h (+29-31) 
- (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll (+4-4) 
- (added) llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll (+78) 
- (added) llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll (+82) 


``````````diff

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index d10355fff1bea..f479abdbb41b6 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -610,9 +610,6 @@ template <typename ContextT> class DivergencePropagator {
     LLVM_DEBUG(dbgs() << "SDA:computeJoinPoints: "
                       << Context.print(&DivTermBlock) << "\n");
 
-    // Early stopping criterion
-    int FloorIdx = CyclePOT.size() - 1;
-    const BlockT *FloorLabel = nullptr;
     int DivTermIdx = CyclePOT.getIndex(&DivTermBlock);
 
     // Bootstrap with branch targets
@@ -626,14 +623,36 @@ template <typename ContextT> class DivergencePropagator {
         LLVM_DEBUG(dbgs() << "\tImmediate divergent cycle exit: "
                           << Context.print(SuccBlock) << "\n");
       }
-      auto SuccIdx = CyclePOT.getIndex(SuccBlock);
       visitEdge(*SuccBlock, *SuccBlock);
-      FloorIdx = std::min<int>(FloorIdx, SuccIdx);
     }
 
+    // Return true if B is inside an irreducible cycle
+    auto IsInIrreducibleCycle = [this](const BlockT *B) {
+      for (const auto *Cycle = CI.getCycle(B); Cycle;
+           Cycle = Cycle->getParentCycle()) {
+        if (!Cycle->isReducible())
+          return true;
+      }
+      return false;
+    };
+
+    // Technically propagation can continue until it reaches the last node.
+    //
+    // For efficiency, propagation can just stop at the IPD (immediate
+    // post-dominator) of successors(DivTemBlock) for any reducible graph.
+    // If FreshLabels.count()=1, the block in FreshLabels should be the IPD.
+    //
+    // For irreducible cycle, propagation continues until it reaches out of
+    // any irreducible cycles first, then stop when FreshLabels.count()=1.
     while (true) {
       auto BlockIdx = FreshLabels.find_last();
-      if (BlockIdx == -1 || BlockIdx < FloorIdx)
+      if (BlockIdx == -1)
+        break;
+
+      const auto *Block = CyclePOT[BlockIdx];
+      // If no irreducible cycle, stop if freshLable.count() = 1 and Block
+      // is the IPD. If it is in any irreducible cycle, continue propagation.
+      if (FreshLabels.count() == 1 && !IsInIrreducibleCycle(Block))
         break;
 
       LLVM_DEBUG(dbgs() << "Current labels:\n"; printDefs(dbgs()));
@@ -644,16 +663,12 @@ template <typename ContextT> class DivergencePropagator {
         continue;
       }
 
-      const auto *Block = CyclePOT[BlockIdx];
       LLVM_DEBUG(dbgs() << "visiting " << Context.print(Block) << " at index "
                         << BlockIdx << "\n");
 
       const auto *Label = BlockLabels[Block];
       assert(Label);
 
-      bool CausedJoin = false;
-      int LoweredFloorIdx = FloorIdx;
-
       // If the current block is the header of a reducible cycle that
       // contains the divergent branch, then the label should be
       // propagated to the cycle exits. Such a header is the "last
@@ -681,28 +696,11 @@ template <typename ContextT> class DivergencePropagator {
       if (const auto *BlockCycle = getReducibleParent(Block)) {
         SmallVector<BlockT *, 4> BlockCycleExits;
         BlockCycle->getExitBlocks(BlockCycleExits);
-        for (auto *BlockCycleExit : BlockCycleExits) {
-          CausedJoin |= visitCycleExitEdge(*BlockCycleExit, *Label);
-          LoweredFloorIdx =
-              std::min<int>(LoweredFloorIdx, CyclePOT.getIndex(BlockCycleExit));
-        }
+        for (auto *BlockCycleExit : BlockCycleExits)
+          visitCycleExitEdge(*BlockCycleExit, *Label);
       } else {
-        for (const auto *SuccBlock : successors(Block)) {
-          CausedJoin |= visitEdge(*SuccBlock, *Label);
-          LoweredFloorIdx =
-              std::min<int>(LoweredFloorIdx, CyclePOT.getIndex(SuccBlock));
-        }
-      }
-
-      // Floor update
-      if (CausedJoin) {
-        // 1. Different labels pushed to successors
-        FloorIdx = LoweredFloorIdx;
-      } else if (FloorLabel != Label) {
-        // 2. No join caused BUT we pushed a label that is different than the
-        // last pushed label
-        FloorIdx = LoweredFloorIdx;
-        FloorLabel = Label;
+        for (const auto *SuccBlock : successors(Block))
+          visitEdge(*SuccBlock, *Label);
       }
     }
 
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
index 46e676b52c0ba..5bb59602faca4 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
@@ -126,10 +126,10 @@ exit:
 ;; only the inner cycle is reported as diverged.
 ;;
 ;; CHECK-LABEL: UniformityInfo for function 'headers_b_t':
-;; CHECK: CYCLES ASSSUMED DIVERGENT:
-;; CHECK:   depth=2: entries(T P) S Q R
-;; CHECK: CYCLES WITH DIVERGENT EXIT:
-;; CHECK:   depth=1: entries(B A) D T S Q P R C
+;; NOCHECK: CYCLES ASSSUMED DIVERGENT:
+;; NOCHECK:   depth=2: entries(T P) S Q R
+;; NOCHECK: CYCLES WITH DIVERGENT EXIT:
+;; NOCHECK:   depth=1: entries(B A) D T S Q P R C
 
 define amdgpu_kernel void @headers_b_t(i32 %a, i32 %b, i32 %c) {
 entry:
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll
new file mode 100644
index 0000000000000..df949a86635c4
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll
@@ -0,0 +1,78 @@
+;
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+;
+; This is to test an if-then-else case with some unmerged basic blocks
+; (https://github.com/llvm/llvm-project/issues/137277)
+;
+;      Entry (div.cond)
+;      /   \
+;     B0   B3
+;     |    |
+;     B1   B4
+;     |    |
+;     B2   B5
+;      \  /
+;       B6 (phi: divergent)
+;
+
+
+; CHECK-LABEL:  'test_ctrl_divergence':
+; CHECK-LABEL:  BLOCK Entry
+; CHECK:  DIVERGENT:   %div.cond = icmp eq i32 %tid, 0
+; CHECK:  DIVERGENT:   br i1 %div.cond, label %B3, label %B0
+;
+; CHECK-LABEL:  BLOCK B6
+; CHECK:  DIVERGENT:   %div_a = phi i32 [ %a0, %B2 ], [ %a1, %B5 ]
+; CHECK:  DIVERGENT:   %div_b = phi i32 [ %b0, %B2 ], [ %b1, %B5 ]
+; CHECK:  DIVERGENT:   %div_c = phi i32 [ %c0, %B2 ], [ %c1, %B5 ]
+
+
+define amdgpu_kernel void @test_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
+Entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp eq i32 %tid, 0
+  br i1 %div.cond, label %B3, label %B0 ; divergent branch
+
+B0:
+  %a0 = add i32 %a, 1
+  br label %B1
+
+B1:
+  %b0 = add i32 %b, 2
+  br label %B2
+
+B2:
+  %c0 = add i32 %c, 3
+  br label %B6
+
+B3:
+  %a1 = add i32 %a, 10
+  br label %B4
+
+B4:
+  %b1 = add i32 %b, 20
+  br label %B5
+
+B5:
+  %c1 = add i32 %c, 30
+  br label %B6
+
+B6:
+  %div_a = phi i32 [%a0, %B2], [%a1,  %B5]
+  %div_b = phi i32 [%b0, %B2], [%b1,  %B5]
+  %div_c = phi i32 [%c0, %B2], [%c1,  %B5]
+  br i1 %div.cond, label %B8, label %B7 ; divergent branch
+
+B7:
+  %d1 = add i32 %d, 1
+  br label %B8
+
+B8:
+  %div_d = phi i32 [%d1, %B7], [%d, %B6]
+  ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = {nounwind readnone }
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll
new file mode 100644
index 0000000000000..54c641862fe79
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll
@@ -0,0 +1,82 @@
+;
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+;
+; This is to test a divergent phi involving loops
+; (https://github.com/llvm/llvm-project/issues/137277).
+;
+;        B0 (div.cond)
+;      /   \
+;  (L)B1   B4
+;     |    |
+;     B2   B5 (L)
+;     |    |
+;     B3   /
+;      \  /
+;      B6 (phi: divergent)
+;
+
+;
+; CHECK-LABEL: UniformityInfo for function 'test_loop_ctrl_divergence':
+; CHECK-LABEL: BLOCK Entry
+; CHECK: DIVERGENT:   %tid = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-LABEL: BLOCK B0
+; CHECK: DIVERGENT:   %div.cond = icmp eq i32 %tid, 0
+; CHECK-LABEL: BLOCK B3
+; CHECK: %uni_a = phi i32 [ %a1, %B2 ], [ %a, %Entry ]
+; CHECK-LABEL: BLOCK B5
+; CHECK: %uni.a3 = phi i32 [ %a2, %B4 ], [ %uni_a3, %B5 ]
+; CHECK-LABEL BLOCK B6
+; CHECK: DIVERGENT:   %div_a = phi i32 [ %uni_a, %B3 ], [ %uni_a3, %B5 ]
+;
+
+define amdgpu_kernel void @test_loop_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
+Entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond0 = icmp eq i32 %d, 0
+  br i1 %uni.cond0, label %B3, label %B0 ; uniform branch
+
+B0:
+  %div.cond = icmp eq i32 %tid, 0
+  br i1 %div.cond, label %B4, label %B1 ; divergent branch
+
+B1:
+  %uni.a0 = phi i32 [%a, %B0], [%a0, %B1]
+  %a0 = add i32 %uni.a0, 1
+  %uni.cond1 = icmp slt i32 %a0, %b
+  br i1 %uni.cond1, label %B1, label %B2
+
+B2:
+  %a1 = add i32 %a0, 10
+  br label %B3
+
+B3:
+  %uni_a = phi i32 [%a1, %B2], [%a,  %Entry]
+  br label %B6
+
+B4:
+  %a2 = add i32 %a, 20
+  br label %B5
+
+B5:
+  %uni.a3= phi i32 [%a2, %B4], [%uni_a3, %B5]
+  %uni_a3 = add i32 %uni.a3, 1
+  %uni.cond2 = icmp slt i32 %uni_a3, %c
+  br i1 %uni.cond2, label %B5, label %B6
+
+B6:
+  %div_a = phi i32 [%uni_a, %B3], [%uni_a3, %B5] ;   divergent
+  %div.cond2 = icmp eq i32 %tid, 2
+  br i1 %div.cond2, label %B7, label %B8 ; divergent branch
+
+B7:
+  %c0 = add i32 %div_a, 2 ; divergent
+  br label %B8
+
+B8:
+  %ret = phi i32 [%c0, %B7], [0, %B6] ; divergent
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = {nounwind readnone }

``````````

</details>


https://github.com/llvm/llvm-project/pull/139667