[llvm] 6421248 - [Uniformity] Fixed control-div early stop (#139667)

Thu May 29 03:43:49 PDT 2025

Author: Junjie Gu
Date: 2025-05-29T10:43:46Z
New Revision: 6421248c959b809efff22773c98b115aac3f6a79

URL: https://github.com/llvm/llvm-project/commit/6421248c959b809efff22773c98b115aac3f6a79
DIFF: https://github.com/llvm/llvm-project/commit/6421248c959b809efff22773c98b115aac3f6a79.diff

LOG: [Uniformity] Fixed control-div early stop (#139667)

Control-divergence finds joins by propagating labels from the divergent
control branch. The code that checks the early stop for propagation is
not correct in some cases.

This PR, which also includes changes from ssahasra, fixes this issue by
stopping no earlier than the post-dominator of the divergent branch.

https://github.com/llvm/llvm-project/issues/137277

---------

Co-authored-by: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>

Added: 
    llvm/test/Analysis/UniformityAnalysis/AMDGPU/branch-after-join.ll
    llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/hidden-post-dom.ll
    llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll
    llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll
    llvm/test/Analysis/UniformityAnalysis/AMDGPU/unstructured-branch.ll

Modified: 
    llvm/include/llvm/ADT/GenericUniformityImpl.h
    llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index d10355fff1bea..715df7ab9a7aa 100644

--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -610,13 +610,29 @@ template <typename ContextT> class DivergencePropagator {
     LLVM_DEBUG(dbgs() << "SDA:computeJoinPoints: "
                       << Context.print(&DivTermBlock) << "\n");
 
-    // Early stopping criterion
-    int FloorIdx = CyclePOT.size() - 1;
-    const BlockT *FloorLabel = nullptr;
     int DivTermIdx = CyclePOT.getIndex(&DivTermBlock);
 
     // Bootstrap with branch targets
     auto const *DivTermCycle = CI.getCycle(&DivTermBlock);
+
+    // Locate the largest ancestor cycle that is not reducible and does not
+    // contain a reducible ancestor. This is done with a lambda that is defined
+    // and invoked in the same statement.
+    const CycleT *IrreducibleAncestor = [](const CycleT *C) -> const CycleT * {
+      if (!C)
+        return nullptr;
+      if (C->isReducible())
+        return nullptr;
+      while (const CycleT *P = C->getParentCycle()) {
+        if (P->isReducible())
+          return C;
+        C = P;
+      }
+      assert(!C->getParentCycle());
+      assert(!C->isReducible());
+      return C;
+    }(DivTermCycle);
+
     for (const auto *SuccBlock : successors(&DivTermBlock)) {
       if (DivTermCycle && !DivTermCycle->contains(SuccBlock)) {
         // If DivTerm exits the cycle immediately, computeJoin() might
@@ -626,14 +642,24 @@ template <typename ContextT> class DivergencePropagator {
         LLVM_DEBUG(dbgs() << "\tImmediate divergent cycle exit: "
                           << Context.print(SuccBlock) << "\n");
       }
-      auto SuccIdx = CyclePOT.getIndex(SuccBlock);
       visitEdge(*SuccBlock, *SuccBlock);
-      FloorIdx = std::min<int>(FloorIdx, SuccIdx);
     }
 
+    // Technically propagation can continue until it reaches the last node.
+    //
+    // For efficiency, propagation can stop if FreshLabels.count()==1. But
+    // For irreducible cycles, let propagation continue until it reaches
+    // out of irreducible cycles (see code for details.)
     while (true) {
       auto BlockIdx = FreshLabels.find_last();
-      if (BlockIdx == -1 || BlockIdx < FloorIdx)
+      if (BlockIdx == -1)
+        break;
+
+      const auto *Block = CyclePOT[BlockIdx];
+      // If no irreducible cycle, stop if freshLable.count() = 1 and Block
+      // is the IPD. If it is in any irreducible cycle, continue propagation.
+      if (FreshLabels.count() == 1 &&
+          (!IrreducibleAncestor || !IrreducibleAncestor->contains(Block)))
         break;
 
       LLVM_DEBUG(dbgs() << "Current labels:\n"; printDefs(dbgs()));
@@ -644,16 +670,12 @@ template <typename ContextT> class DivergencePropagator {
         continue;
       }
 
-      const auto *Block = CyclePOT[BlockIdx];
       LLVM_DEBUG(dbgs() << "visiting " << Context.print(Block) << " at index "
                         << BlockIdx << "\n");
 
       const auto *Label = BlockLabels[Block];
       assert(Label);
 
-      bool CausedJoin = false;
-      int LoweredFloorIdx = FloorIdx;
-
       // If the current block is the header of a reducible cycle that
       // contains the divergent branch, then the label should be
       // propagated to the cycle exits. Such a header is the "last
@@ -681,28 +703,11 @@ template <typename ContextT> class DivergencePropagator {
       if (const auto *BlockCycle = getReducibleParent(Block)) {
         SmallVector<BlockT *, 4> BlockCycleExits;
         BlockCycle->getExitBlocks(BlockCycleExits);
-        for (auto *BlockCycleExit : BlockCycleExits) {
-          CausedJoin |= visitCycleExitEdge(*BlockCycleExit, *Label);
-          LoweredFloorIdx =
-              std::min<int>(LoweredFloorIdx, CyclePOT.getIndex(BlockCycleExit));
-        }
+        for (auto *BlockCycleExit : BlockCycleExits)
+          visitCycleExitEdge(*BlockCycleExit, *Label);
       } else {
-        for (const auto *SuccBlock : successors(Block)) {
-          CausedJoin |= visitEdge(*SuccBlock, *Label);
-          LoweredFloorIdx =
-              std::min<int>(LoweredFloorIdx, CyclePOT.getIndex(SuccBlock));
-        }
-      }
-
-      // Floor update
-      if (CausedJoin) {
-        // 1. Different labels pushed to successors
-        FloorIdx = LoweredFloorIdx;
-      } else if (FloorLabel != Label) {
-        // 2. No join caused BUT we pushed a label that is different than the
-        // last pushed label
-        FloorIdx = LoweredFloorIdx;
-        FloorLabel = Label;
+        for (const auto *SuccBlock : successors(Block))
+          visitEdge(*SuccBlock, *Label);
       }
     }
 

diff  --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/branch-after-join.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/branch-after-join.ll
new file mode 100644
index 0000000000000..7fed0854f0cb3
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/branch-after-join.ll
@@ -0,0 +1,94 @@
+;
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+;
+;
+;      Entry (div.cond)
+;      /   \
+;     B0   B3
+;     |    |
+;     B1   B4
+;     |    |
+;      \  /
+;       B5 (phi: divergent)
+;       |
+;       B6  (div.uni)
+;      /   \
+;     B7   B9
+;     |    |
+;     B8   B10
+;     |    |
+;      \  /
+;       B11 (phi: uniform)
+
+
+; CHECK-LABEL:  'test_ctrl_divergence':
+; CHECK-LABEL:  BLOCK Entry
+; CHECK:  DIVERGENT:   %div.cond = icmp eq i32 %tid, 0
+; CHECK:  DIVERGENT:   br i1 %div.cond, label %B3, label %B0
+;
+; CHECK-LABEL:  BLOCK B5
+; CHECK:  DIVERGENT:   %div_a = phi i32 [ %a0, %B1 ], [ %a1, %B4 ]
+; CHECK:  DIVERGENT:   %div_b = phi i32 [ %b0, %B1 ], [ %b1, %B4 ]
+;
+; CHECK-LABEL:  BLOCK B6
+; CHECK-NOT:  DIVERGENT:   %uni.cond = icmp
+; CHECK-NOT:  DIVERGENT:   br i1 %div.cond
+;
+; CHECK-LABEL:  BLOCK B11
+; CHECK-NOT:  DIVERGENT:   %div_d = phi i32
+
+
+define amdgpu_kernel void @test_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
+Entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp eq i32 %tid, 0
+  br i1 %div.cond, label %B3, label %B0 ; divergent branch
+
+B0:
+  %a0 = add i32 %a, 1
+  br label %B1
+
+B1:
+  %b0 = add i32 %b, 2
+  br label %B5
+
+B3:
+  %a1 = add i32 %a, 10
+  br label %B4
+
+B4:
+  %b1 = add i32 %b, 20
+  br label %B5
+
+B5:
+  %div_a = phi i32 [%a0, %B1], [%a1,  %B4]
+  %div_b = phi i32 [%b0, %B1], [%b1,  %B4]
+  br label %B6
+
+B6:
+  %uni.cond = icmp eq i32 %c, 0
+  br i1 %uni.cond, label %B7, label %B9
+
+B7:
+  %d1 = add i32 %d, 1
+  br label %B8
+
+B8:
+  br label %B11
+
+B9:
+  %d2 = add i32 %d, 3
+  br label %B10
+
+B10:
+  br label %B11
+
+B11:
+  %div_d = phi i32 [%d1, %B8], [%d2, %B10]
+  ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = {nounwind readnone }

diff  --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
index 46e676b52c0ba..8dd44eb878e96 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
@@ -128,8 +128,7 @@ exit:
 ;; CHECK-LABEL: UniformityInfo for function 'headers_b_t':
 ;; CHECK: CYCLES ASSSUMED DIVERGENT:
 ;; CHECK:   depth=2: entries(T P) S Q R
-;; CHECK: CYCLES WITH DIVERGENT EXIT:
-;; CHECK:   depth=1: entries(B A) D T S Q P R C
+;; CHECK-NOT: CYCLES WITH DIVERGENT EXIT:
 
 define amdgpu_kernel void @headers_b_t(i32 %a, i32 %b, i32 %c) {
 entry:

diff  --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/hidden-post-dom.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/hidden-post-dom.ll
new file mode 100644
index 0000000000000..1c76c7de61d72
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/hidden-post-dom.ll
@@ -0,0 +1,56 @@
+; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
+
+define amdgpu_kernel void @cycle_inner_ipd(i32 %n, i32 %a, i32 %b) #0 {
+;
+;          entry
+;        /      \
+;      E2<------E1
+;       | \     ^^
+;       |  \  /  |
+;       |   v/   |
+;       |   A    |
+;       |  /     |
+;       | /      |
+;       vv       |
+;       B------->C
+;                |
+;                X
+;
+;
+; CHECK-LABEL: BLOCK entry
+; CHECK:  DIVERGENT:   %tid = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK:  DIVERGENT:   %div.cond = icmp slt i32 %tid, 0
+; CHECK: END BLOCK
+;
+; CHECK-LABEL: BLOCK B
+; CHECK:  DIVERGENT:   %div.merge = phi i32 [ 0, %A ], [ %b, %E2 ]
+; CHECK: END BLOCK
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+  %uni.cond = icmp slt i32 %a, 0
+  %uni.cond1 = icmp slt i32 %a, 2
+  %uni.cond2 = icmp slt i32 %a, 10
+  br i1 %uni.cond, label %E2, label %E1
+
+E1:
+  br label %E2
+
+E2:
+  br i1 %uni.cond1, label %A, label %B
+
+
+A:
+  br i1 %div.cond, label %E1, label %B
+
+B:
+  %div.merge = phi i32 [ 0, %A ], [ %b, %E2 ]
+  br label %C
+
+C:
+  br i1 %uni.cond2, label %E1, label %X
+
+X:
+  ret void
+}

diff  --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll
new file mode 100644
index 0000000000000..971a6a16b93fd
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll
@@ -0,0 +1,75 @@
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+;
+; This is to test an if-then-else case with some unmerged basic blocks
+; (https://github.com/llvm/llvm-project/issues/137277)
+;
+;      Entry (div.cond)
+;      /   \
+;     B0   B3
+;     |    |
+;     B1   B4
+;     |    |
+;     B2   B5
+;      \  /
+;       B6 (phi: divergent)
+;
+
+
+; CHECK-LABEL:  'test_ctrl_divergence':
+; CHECK-LABEL:  BLOCK Entry
+; CHECK:  DIVERGENT:   %div.cond = icmp eq i32 %tid, 0
+; CHECK:  DIVERGENT:   br i1 %div.cond, label %B3, label %B0
+;
+; CHECK-LABEL:  BLOCK B6
+; CHECK:  DIVERGENT:   %div_a = phi i32 [ %a0, %B2 ], [ %a1, %B5 ]
+; CHECK:  DIVERGENT:   %div_b = phi i32 [ %b0, %B2 ], [ %b1, %B5 ]
+; CHECK:  DIVERGENT:   %div_c = phi i32 [ %c0, %B2 ], [ %c1, %B5 ]
+
+
+define amdgpu_kernel void @test_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
+Entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp eq i32 %tid, 0
+  br i1 %div.cond, label %B3, label %B0 ; divergent branch
+
+B0:
+  %a0 = add i32 %a, 1
+  br label %B1
+
+B1:
+  %b0 = add i32 %b, 2
+  br label %B2
+
+B2:
+  %c0 = add i32 %c, 3
+  br label %B6
+
+B3:
+  %a1 = add i32 %a, 10
+  br label %B4
+
+B4:
+  %b1 = add i32 %b, 20
+  br label %B5
+
+B5:
+  %c1 = add i32 %c, 30
+  br label %B6
+
+B6:
+  %div_a = phi i32 [%a0, %B2], [%a1,  %B5]
+  %div_b = phi i32 [%b0, %B2], [%b1,  %B5]
+  %div_c = phi i32 [%c0, %B2], [%c1,  %B5]
+  br i1 %div.cond, label %B8, label %B7 ; divergent branch
+
+B7:
+  %d1 = add i32 %d, 1
+  br label %B8
+
+B8:
+  %div_d = phi i32 [%d1, %B7], [%d, %B6]
+  ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll
new file mode 100644
index 0000000000000..5b56251312307
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll
@@ -0,0 +1,79 @@
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+;
+; This is to test a divergent phi involving loops
+; (https://github.com/llvm/llvm-project/issues/137277).
+;
+;        B0 (div.cond)
+;      /   \
+;  (L)B1   B4
+;     |    |
+;     B2   B5 (L)
+;     |    |
+;     B3   /
+;      \  /
+;      B6 (phi: divergent)
+;
+
+;
+; CHECK-LABEL: UniformityInfo for function 'test_loop_ctrl_divergence':
+; CHECK-LABEL: BLOCK Entry
+; CHECK: DIVERGENT:   %tid = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-LABEL: BLOCK B0
+; CHECK: DIVERGENT:   %div.cond = icmp eq i32 %tid, 0
+; CHECK-LABEL: BLOCK B3
+; CHECK: %uni_a = phi i32 [ %a1, %B2 ], [ %a, %Entry ]
+; CHECK-LABEL: BLOCK B5
+; CHECK: %uni.a3 = phi i32 [ %a2, %B4 ], [ %uni_a3, %B5 ]
+; CHECK-LABEL BLOCK B6
+; CHECK: DIVERGENT:   %div_a = phi i32 [ %uni_a, %B3 ], [ %uni_a3, %B5 ]
+;
+
+define amdgpu_kernel void @test_loop_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
+Entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond0 = icmp eq i32 %d, 0
+  br i1 %uni.cond0, label %B3, label %B0 ; uniform branch
+
+B0:
+  %div.cond = icmp eq i32 %tid, 0
+  br i1 %div.cond, label %B4, label %B1 ; divergent branch
+
+B1:
+  %uni.a0 = phi i32 [%a, %B0], [%a0, %B1]
+  %a0 = add i32 %uni.a0, 1
+  %uni.cond1 = icmp slt i32 %a0, %b
+  br i1 %uni.cond1, label %B1, label %B2
+
+B2:
+  %a1 = add i32 %a0, 10
+  br label %B3
+
+B3:
+  %uni_a = phi i32 [%a1, %B2], [%a,  %Entry]
+  br label %B6
+
+B4:
+  %a2 = add i32 %a, 20
+  br label %B5
+
+B5:
+  %uni.a3= phi i32 [%a2, %B4], [%uni_a3, %B5]
+  %uni_a3 = add i32 %uni.a3, 1
+  %uni.cond2 = icmp slt i32 %uni_a3, %c
+  br i1 %uni.cond2, label %B5, label %B6
+
+B6:
+  %div_a = phi i32 [%uni_a, %B3], [%uni_a3, %B5] ;   divergent
+  %div.cond2 = icmp eq i32 %tid, 2
+  br i1 %div.cond2, label %B7, label %B8 ; divergent branch
+
+B7:
+  %c0 = add i32 %div_a, 2 ; divergent
+  br label %B8
+
+B8:
+  %ret = phi i32 [%c0, %B7], [0, %B6] ; divergent
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/unstructured-branch.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/unstructured-branch.ll
new file mode 100644
index 0000000000000..c6b054fea204a
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/unstructured-branch.ll
@@ -0,0 +1,87 @@
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+
+;             Alpha (div.uni)
+;              |   \
+;             Entry \
+;          (div.cond)\
+;             /   \   \
+;            B0   B3  |
+;            |    |   |
+;            B1   B4<-+
+;            |    |
+;            B2   B5
+;          /  |    |
+;         /   |   B501
+;        /    |    |
+;     B201->B202  B502
+;             \  /
+;              B6 (phi: divergent)
+;
+;
+; CHECK-LABEL:  'test_ctrl_divergence':
+; CHECK-LABEL:  BLOCK Entry
+; CHECK:  DIVERGENT:   %div.cond = icmp eq i32 %tid, 0
+; CHECK:  DIVERGENT:   br i1 %div.cond, label %B3, label %B0
+;
+; CHECK-LABEL:  BLOCK B6
+; CHECK:  DIVERGENT:   %div_a = phi i32 [ %a0, %B202 ], [ %a1, %B502 ]
+; CHECK:  DIVERGENT:   %div_b = phi i32 [ %b0, %B202 ], [ %b1, %B502 ]
+; CHECK:  DIVERGENT:   %div_c = phi i32 [ %c0, %B202 ], [ %c1, %B502 ]
+
+define amdgpu_kernel void @test_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
+Alpha:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.uni = icmp eq i32 %a, 0
+  br i1 %div.uni, label %Entry, label %B4
+
+Entry:
+  %div.cond = icmp eq i32 %tid, 0
+  br i1 %div.cond, label %B3, label %B0 ; divergent branch
+
+B0:
+  br label %B1
+
+B1:
+  br label %B2
+
+B2:
+  %a0 = add i32 %a, 1
+  %b0 = add i32 %b, 2
+  %c0 = add i32 %c, 3
+  br i1 %div.uni, label %B201, label %B202
+
+B201:
+  br label %B202
+
+B202:
+  br label %B6
+
+B3:
+  br label %B4
+
+B4:
+  %a1 = add i32 %a, 10
+  %b1 = add i32 %b, 20
+  %c1 = add i32 %c, 30
+  br i1 %div.uni, label %B5, label %B501
+
+B5:
+  br label %B501
+
+B501:
+  br label %B502
+
+B502:
+  br label %B6
+
+B6:
+  %div_a = phi i32 [%a0, %B202], [%a1,  %B502]
+  %div_b = phi i32 [%b0, %B202], [%b1,  %B502]
+  %div_c = phi i32 [%c0, %B202], [%c1,  %B502]
+  ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = {nounwind readnone }