[llvm] [Uniformity] Fixed control-div early stop (PR #139667)

Tue May 27 23:32:38 PDT 2025

https://github.com/jgu222 updated https://github.com/llvm/llvm-project/pull/139667

>From 2c1a0f018221babef47cd4827fd2e0991215601c Mon Sep 17 00:00:00 2001
From: "Gu, Junjie" <junjie.gu at intel.com>
Date: Mon, 12 May 2025 22:04:57 -0700
Subject: [PATCH 1/5] [Uniformity] Fixed control-div early stop

Control-divergence finds joins by propagating labels from the divergent
control branch. The code that checks the early stop for propagation is
not correct in some cases.

This change fixes this issue by stopping at the post-dominator of
the successors of the divergent branch.

https://github.com/llvm/llvm-project/issues/137277
---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 60 +++++++-------
 .../diverged-entry-headers-nested.ll          |  8 +-
 .../AMDGPU/phi_div_branch.ll                  | 78 ++++++++++++++++++
 .../UniformityAnalysis/AMDGPU/phi_div_loop.ll | 82 +++++++++++++++++++
 4 files changed, 193 insertions(+), 35 deletions(-)
 create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll
 create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index d10355fff1bea..f479abdbb41b6 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -610,9 +610,6 @@ template <typename ContextT> class DivergencePropagator {
     LLVM_DEBUG(dbgs() << "SDA:computeJoinPoints: "
                       << Context.print(&DivTermBlock) << "\n");
 
-    // Early stopping criterion
-    int FloorIdx = CyclePOT.size() - 1;
-    const BlockT *FloorLabel = nullptr;
     int DivTermIdx = CyclePOT.getIndex(&DivTermBlock);
 
     // Bootstrap with branch targets
@@ -626,14 +623,36 @@ template <typename ContextT> class DivergencePropagator {
         LLVM_DEBUG(dbgs() << "\tImmediate divergent cycle exit: "
                           << Context.print(SuccBlock) << "\n");
       }
-      auto SuccIdx = CyclePOT.getIndex(SuccBlock);
       visitEdge(*SuccBlock, *SuccBlock);
-      FloorIdx = std::min<int>(FloorIdx, SuccIdx);
     }
 
+    // Return true if B is inside an irreducible cycle
+    auto IsInIrreducibleCycle = [this](const BlockT *B) {
+      for (const auto *Cycle = CI.getCycle(B); Cycle;
+           Cycle = Cycle->getParentCycle()) {
+        if (!Cycle->isReducible())
+          return true;
+      }
+      return false;
+    };
+
+    // Technically propagation can continue until it reaches the last node.
+    //
+    // For efficiency, propagation can just stop at the IPD (immediate
+    // post-dominator) of successors(DivTemBlock) for any reducible graph.
+    // If FreshLabels.count()=1, the block in FreshLabels should be the IPD.
+    //
+    // For irreducible cycle, propagation continues until it reaches out of
+    // any irreducible cycles first, then stop when FreshLabels.count()=1.
     while (true) {
       auto BlockIdx = FreshLabels.find_last();
-      if (BlockIdx == -1 || BlockIdx < FloorIdx)
+      if (BlockIdx == -1)
+        break;
+
+      const auto *Block = CyclePOT[BlockIdx];
+      // If no irreducible cycle, stop if freshLable.count() = 1 and Block
+      // is the IPD. If it is in any irreducible cycle, continue propagation.
+      if (FreshLabels.count() == 1 && !IsInIrreducibleCycle(Block))
         break;
 
       LLVM_DEBUG(dbgs() << "Current labels:\n"; printDefs(dbgs()));
@@ -644,16 +663,12 @@ template <typename ContextT> class DivergencePropagator {
         continue;
       }
 
-      const auto *Block = CyclePOT[BlockIdx];
       LLVM_DEBUG(dbgs() << "visiting " << Context.print(Block) << " at index "
                         << BlockIdx << "\n");
 
       const auto *Label = BlockLabels[Block];
       assert(Label);
 
-      bool CausedJoin = false;
-      int LoweredFloorIdx = FloorIdx;
-
       // If the current block is the header of a reducible cycle that
       // contains the divergent branch, then the label should be
       // propagated to the cycle exits. Such a header is the "last
@@ -681,28 +696,11 @@ template <typename ContextT> class DivergencePropagator {
       if (const auto *BlockCycle = getReducibleParent(Block)) {
         SmallVector<BlockT *, 4> BlockCycleExits;
         BlockCycle->getExitBlocks(BlockCycleExits);
-        for (auto *BlockCycleExit : BlockCycleExits) {
-          CausedJoin |= visitCycleExitEdge(*BlockCycleExit, *Label);
-          LoweredFloorIdx =
-              std::min<int>(LoweredFloorIdx, CyclePOT.getIndex(BlockCycleExit));
-        }
+        for (auto *BlockCycleExit : BlockCycleExits)
+          visitCycleExitEdge(*BlockCycleExit, *Label);
       } else {
-        for (const auto *SuccBlock : successors(Block)) {
-          CausedJoin |= visitEdge(*SuccBlock, *Label);
-          LoweredFloorIdx =
-              std::min<int>(LoweredFloorIdx, CyclePOT.getIndex(SuccBlock));
-        }
-      }
-
-      // Floor update
-      if (CausedJoin) {
-        // 1. Different labels pushed to successors
-        FloorIdx = LoweredFloorIdx;
-      } else if (FloorLabel != Label) {
-        // 2. No join caused BUT we pushed a label that is different than the
-        // last pushed label
-        FloorIdx = LoweredFloorIdx;
-        FloorLabel = Label;
+        for (const auto *SuccBlock : successors(Block))
+          visitEdge(*SuccBlock, *Label);
       }
     }
 
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
index 46e676b52c0ba..5bb59602faca4 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
@@ -126,10 +126,10 @@ exit:
 ;; only the inner cycle is reported as diverged.
 ;;
 ;; CHECK-LABEL: UniformityInfo for function 'headers_b_t':
-;; CHECK: CYCLES ASSSUMED DIVERGENT:
-;; CHECK:   depth=2: entries(T P) S Q R
-;; CHECK: CYCLES WITH DIVERGENT EXIT:
-;; CHECK:   depth=1: entries(B A) D T S Q P R C
+;; NOCHECK: CYCLES ASSSUMED DIVERGENT:
+;; NOCHECK:   depth=2: entries(T P) S Q R
+;; NOCHECK: CYCLES WITH DIVERGENT EXIT:
+;; NOCHECK:   depth=1: entries(B A) D T S Q P R C
 
 define amdgpu_kernel void @headers_b_t(i32 %a, i32 %b, i32 %c) {
 entry:
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll
new file mode 100644
index 0000000000000..df949a86635c4
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll
@@ -0,0 +1,78 @@
+;
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+;
+; This is to test an if-then-else case with some unmerged basic blocks
+; (https://github.com/llvm/llvm-project/issues/137277)
+;
+;      Entry (div.cond)
+;      /   \
+;     B0   B3
+;     |    |
+;     B1   B4
+;     |    |
+;     B2   B5
+;      \  /
+;       B6 (phi: divergent)
+;
+
+
+; CHECK-LABEL:  'test_ctrl_divergence':
+; CHECK-LABEL:  BLOCK Entry
+; CHECK:  DIVERGENT:   %div.cond = icmp eq i32 %tid, 0
+; CHECK:  DIVERGENT:   br i1 %div.cond, label %B3, label %B0
+;
+; CHECK-LABEL:  BLOCK B6
+; CHECK:  DIVERGENT:   %div_a = phi i32 [ %a0, %B2 ], [ %a1, %B5 ]
+; CHECK:  DIVERGENT:   %div_b = phi i32 [ %b0, %B2 ], [ %b1, %B5 ]
+; CHECK:  DIVERGENT:   %div_c = phi i32 [ %c0, %B2 ], [ %c1, %B5 ]
+
+
+define amdgpu_kernel void @test_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
+Entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp eq i32 %tid, 0
+  br i1 %div.cond, label %B3, label %B0 ; divergent branch
+
+B0:
+  %a0 = add i32 %a, 1
+  br label %B1
+
+B1:
+  %b0 = add i32 %b, 2
+  br label %B2
+
+B2:
+  %c0 = add i32 %c, 3
+  br label %B6
+
+B3:
+  %a1 = add i32 %a, 10
+  br label %B4
+
+B4:
+  %b1 = add i32 %b, 20
+  br label %B5
+
+B5:
+  %c1 = add i32 %c, 30
+  br label %B6
+
+B6:
+  %div_a = phi i32 [%a0, %B2], [%a1,  %B5]
+  %div_b = phi i32 [%b0, %B2], [%b1,  %B5]
+  %div_c = phi i32 [%c0, %B2], [%c1,  %B5]
+  br i1 %div.cond, label %B8, label %B7 ; divergent branch
+
+B7:
+  %d1 = add i32 %d, 1
+  br label %B8
+
+B8:
+  %div_d = phi i32 [%d1, %B7], [%d, %B6]
+  ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = {nounwind readnone }
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll
new file mode 100644
index 0000000000000..54c641862fe79
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll
@@ -0,0 +1,82 @@
+;
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+;
+; This is to test a divergent phi involving loops
+; (https://github.com/llvm/llvm-project/issues/137277).
+;
+;        B0 (div.cond)
+;      /   \
+;  (L)B1   B4
+;     |    |
+;     B2   B5 (L)
+;     |    |
+;     B3   /
+;      \  /
+;      B6 (phi: divergent)
+;
+
+;
+; CHECK-LABEL: UniformityInfo for function 'test_loop_ctrl_divergence':
+; CHECK-LABEL: BLOCK Entry
+; CHECK: DIVERGENT:   %tid = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-LABEL: BLOCK B0
+; CHECK: DIVERGENT:   %div.cond = icmp eq i32 %tid, 0
+; CHECK-LABEL: BLOCK B3
+; CHECK: %uni_a = phi i32 [ %a1, %B2 ], [ %a, %Entry ]
+; CHECK-LABEL: BLOCK B5
+; CHECK: %uni.a3 = phi i32 [ %a2, %B4 ], [ %uni_a3, %B5 ]
+; CHECK-LABEL: BLOCK B6
+; CHECK: DIVERGENT:   %div_a = phi i32 [ %uni_a, %B3 ], [ %uni_a3, %B5 ]
+;
+
+define amdgpu_kernel void @test_loop_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
+Entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond0 = icmp eq i32 %d, 0
+  br i1 %uni.cond0, label %B3, label %B0 ; uniform branch
+
+B0:
+  %div.cond = icmp eq i32 %tid, 0
+  br i1 %div.cond, label %B4, label %B1 ; divergent branch
+
+B1:
+  %uni.a0 = phi i32 [%a, %B0], [%a0, %B1]
+  %a0 = add i32 %uni.a0, 1
+  %uni.cond1 = icmp slt i32 %a0, %b
+  br i1 %uni.cond1, label %B1, label %B2
+
+B2:
+  %a1 = add i32 %a0, 10
+  br label %B3
+
+B3:
+  %uni_a = phi i32 [%a1, %B2], [%a,  %Entry]
+  br label %B6
+
+B4:
+  %a2 = add i32 %a, 20
+  br label %B5
+
+B5:
+  %uni.a3= phi i32 [%a2, %B4], [%uni_a3, %B5]
+  %uni_a3 = add i32 %uni.a3, 1
+  %uni.cond2 = icmp slt i32 %uni_a3, %c
+  br i1 %uni.cond2, label %B5, label %B6
+
+B6:
+  %div_a = phi i32 [%uni_a, %B3], [%uni_a3, %B5] ;   divergent
+  %div.cond2 = icmp eq i32 %tid, 2
+  br i1 %div.cond2, label %B7, label %B8 ; divergent branch
+
+B7:
+  %c0 = add i32 %div_a, 2 ; divergent
+  br label %B8
+
+B8:
+  %ret = phi i32 [%c0, %B7], [0, %B6] ; divergent
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = {nounwind readnone }

>From 5bb790f2d2df92f2bc1440a0092f281af2e2e474 Mon Sep 17 00:00:00 2001
From: "Gu, Junjie" <junjie.gu at intel.com>
Date: Thu, 22 May 2025 20:27:33 -0700
Subject: [PATCH 2/5] [UniformAnalysis] Changes based on feedback

Based on review feedback, modify the early-stop check for loops and make minor lit test fixes.
---
 llvm/include/llvm/ADT/GenericUniformityImpl.h        | 12 ++++++------
 .../UniformityAnalysis/AMDGPU/phi_div_branch.ll      |  5 +----
 .../UniformityAnalysis/AMDGPU/phi_div_loop.ll        |  5 +----
 3 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index f479abdbb41b6..1c7ee02fe0f7a 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -630,6 +630,9 @@ template <typename ContextT> class DivergencePropagator {
     auto IsInIrreducibleCycle = [this](const BlockT *B) {
       for (const auto *Cycle = CI.getCycle(B); Cycle;
            Cycle = Cycle->getParentCycle()) {
+        // If everything is inside a reducible cycle, then look no further
+        if (Cycle->isReducible() && Cycle->contains(&DivTermBlock))
+          return false;
         if (!Cycle->isReducible())
           return true;
       }
@@ -638,12 +641,9 @@ template <typename ContextT> class DivergencePropagator {
 
     // Technically propagation can continue until it reaches the last node.
     //
-    // For efficiency, propagation can just stop at the IPD (immediate
-    // post-dominator) of successors(DivTemBlock) for any reducible graph.
-    // If FreshLabels.count()=1, the block in FreshLabels should be the IPD.
-    //
-    // For irreducible cycle, propagation continues until it reaches out of
-    // any irreducible cycles first, then stop when FreshLabels.count()=1.
+    // For efficiency, propagation can stop if FreshLabels.count()==1. But
+    // for irreducible cycles, let propagation continue until it reaches
+    // out of irreducible cycles (see code for details).
     while (true) {
       auto BlockIdx = FreshLabels.find_last();
       if (BlockIdx == -1)
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll
index df949a86635c4..971a6a16b93fd 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll
@@ -1,4 +1,3 @@
-;
 ; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 ;
 ; This is to test an if-then-else case with some unmerged basic blocks
@@ -73,6 +72,4 @@ B8:
 }
 
 
-declare i32 @llvm.amdgcn.workitem.id.x() #0
-
-attributes #0 = {nounwind readnone }
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll
index 54c641862fe79..5b56251312307 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll
@@ -1,4 +1,3 @@
-;
 ; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 ;
 ; This is to test a divergent phi involving loops
@@ -77,6 +76,4 @@ B8:
   ret void
 }
 
-declare i32 @llvm.amdgcn.workitem.id.x() #0
-
-attributes #0 = {nounwind readnone }
+declare i32 @llvm.amdgcn.workitem.id.x()

>From 0dbb9b5eb97a25c7bf04b7673b0fcf7f8291afdd Mon Sep 17 00:00:00 2001
From: "Gu, Junjie" <junjie.gu at intel.com>
Date: Mon, 26 May 2025 09:52:53 -0700
Subject: [PATCH 3/5] Fix lit test based on feedback

---
 .../AMDGPU/irreducible/diverged-entry-headers-nested.ll    | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
index 5bb59602faca4..8dd44eb878e96 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
@@ -126,10 +126,9 @@ exit:
 ;; only the inner cycle is reported as diverged.
 ;;
 ;; CHECK-LABEL: UniformityInfo for function 'headers_b_t':
-;; NOCHECK: CYCLES ASSSUMED DIVERGENT:
-;; NOCHECK:   depth=2: entries(T P) S Q R
-;; NOCHECK: CYCLES WITH DIVERGENT EXIT:
-;; NOCHECK:   depth=1: entries(B A) D T S Q P R C
+;; CHECK: CYCLES ASSSUMED DIVERGENT:
+;; CHECK:   depth=2: entries(T P) S Q R
+;; CHECK-NOT: CYCLES WITH DIVERGENT EXIT:
 
 define amdgpu_kernel void @headers_b_t(i32 %a, i32 %b, i32 %c) {
 entry:

>From 55b9601cddc53f663c468e3dea7c5a3b73b38405 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Mon, 26 May 2025 09:51:21 +0530
Subject: [PATCH 4/5] [LLVM] [NFC] Add more tests for uniformity analysis

---
 .../AMDGPU/branch-after-join.ll               | 94 +++++++++++++++++++
 .../AMDGPU/irreducible/hidden-post-dom.ll     | 56 +++++++++++
 .../AMDGPU/unstructured-branch.ll             | 87 +++++++++++++++++
 3 files changed, 237 insertions(+)
 create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/branch-after-join.ll
 create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/hidden-post-dom.ll
 create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/unstructured-branch.ll

diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/branch-after-join.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/branch-after-join.ll
new file mode 100644
index 0000000000000..7fed0854f0cb3
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/branch-after-join.ll
@@ -0,0 +1,94 @@
+;
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+;
+;
+;      Entry (div.cond)
+;      /   \
+;     B0   B3
+;     |    |
+;     B1   B4
+;     |    |
+;      \  /
+;       B5 (phi: divergent)
+;       |
+;       B6  (div.uni)
+;      /   \
+;     B7   B9
+;     |    |
+;     B8   B10
+;     |    |
+;      \  /
+;       B11 (phi: uniform)
+
+
+; CHECK-LABEL:  'test_ctrl_divergence':
+; CHECK-LABEL:  BLOCK Entry
+; CHECK:  DIVERGENT:   %div.cond = icmp eq i32 %tid, 0
+; CHECK:  DIVERGENT:   br i1 %div.cond, label %B3, label %B0
+;
+; CHECK-LABEL:  BLOCK B5
+; CHECK:  DIVERGENT:   %div_a = phi i32 [ %a0, %B1 ], [ %a1, %B4 ]
+; CHECK:  DIVERGENT:   %div_b = phi i32 [ %b0, %B1 ], [ %b1, %B4 ]
+;
+; CHECK-LABEL:  BLOCK B6
+; CHECK-NOT:  DIVERGENT:   %uni.cond = icmp
+; CHECK-NOT:  DIVERGENT:   br i1 %uni.cond
+;
+; CHECK-LABEL:  BLOCK B11
+; CHECK-NOT:  DIVERGENT:   %div_d = phi i32
+
+
+define amdgpu_kernel void @test_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
+Entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp eq i32 %tid, 0
+  br i1 %div.cond, label %B3, label %B0 ; divergent branch
+
+B0:
+  %a0 = add i32 %a, 1
+  br label %B1
+
+B1:
+  %b0 = add i32 %b, 2
+  br label %B5
+
+B3:
+  %a1 = add i32 %a, 10
+  br label %B4
+
+B4:
+  %b1 = add i32 %b, 20
+  br label %B5
+
+B5:
+  %div_a = phi i32 [%a0, %B1], [%a1,  %B4]
+  %div_b = phi i32 [%b0, %B1], [%b1,  %B4]
+  br label %B6
+
+B6:
+  %uni.cond = icmp eq i32 %c, 0
+  br i1 %uni.cond, label %B7, label %B9
+
+B7:
+  %d1 = add i32 %d, 1
+  br label %B8
+
+B8:
+  br label %B11
+
+B9:
+  %d2 = add i32 %d, 3
+  br label %B10
+
+B10:
+  br label %B11
+
+B11:
+  %div_d = phi i32 [%d1, %B8], [%d2, %B10]
+  ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = {nounwind readnone }
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/hidden-post-dom.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/hidden-post-dom.ll
new file mode 100644
index 0000000000000..1c76c7de61d72
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/hidden-post-dom.ll
@@ -0,0 +1,56 @@
+; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
+
+define amdgpu_kernel void @cycle_inner_ipd(i32 %n, i32 %a, i32 %b) #0 {
+;
+;          entry
+;        /      \
+;      E2<------E1
+;       | \     ^^
+;       |  \  /  |
+;       |   v/   |
+;       |   A    |
+;       |  /     |
+;       | /      |
+;       vv       |
+;       B------->C
+;                |
+;                X
+;
+;
+; CHECK-LABEL: BLOCK entry
+; CHECK:  DIVERGENT:   %tid = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK:  DIVERGENT:   %div.cond = icmp slt i32 %tid, 0
+; CHECK: END BLOCK
+;
+; CHECK-LABEL: BLOCK B
+; CHECK:  DIVERGENT:   %div.merge = phi i32 [ 0, %A ], [ %b, %E2 ]
+; CHECK: END BLOCK
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+  %uni.cond = icmp slt i32 %a, 0
+  %uni.cond1 = icmp slt i32 %a, 2
+  %uni.cond2 = icmp slt i32 %a, 10
+  br i1 %uni.cond, label %E2, label %E1
+
+E1:
+  br label %E2
+
+E2:
+  br i1 %uni.cond1, label %A, label %B
+
+
+A:
+  br i1 %div.cond, label %E1, label %B
+
+B:
+  %div.merge = phi i32 [ 0, %A ], [ %b, %E2 ]
+  br label %C
+
+C:
+  br i1 %uni.cond2, label %E1, label %X
+
+X:
+  ret void
+}
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/unstructured-branch.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/unstructured-branch.ll
new file mode 100644
index 0000000000000..c6b054fea204a
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/unstructured-branch.ll
@@ -0,0 +1,87 @@
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+
+;             Alpha (div.uni)
+;              |   \
+;             Entry \
+;          (div.cond)\
+;             /   \   \
+;            B0   B3  |
+;            |    |   |
+;            B1   B4<-+
+;            |    |
+;            B2   B5
+;          /  |    |
+;         /   |   B501
+;        /    |    |
+;     B201->B202  B502
+;             \  /
+;              B6 (phi: divergent)
+;
+;
+; CHECK-LABEL:  'test_ctrl_divergence':
+; CHECK-LABEL:  BLOCK Entry
+; CHECK:  DIVERGENT:   %div.cond = icmp eq i32 %tid, 0
+; CHECK:  DIVERGENT:   br i1 %div.cond, label %B3, label %B0
+;
+; CHECK-LABEL:  BLOCK B6
+; CHECK:  DIVERGENT:   %div_a = phi i32 [ %a0, %B202 ], [ %a1, %B502 ]
+; CHECK:  DIVERGENT:   %div_b = phi i32 [ %b0, %B202 ], [ %b1, %B502 ]
+; CHECK:  DIVERGENT:   %div_c = phi i32 [ %c0, %B202 ], [ %c1, %B502 ]
+
+define amdgpu_kernel void @test_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
+Alpha:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.uni = icmp eq i32 %a, 0
+  br i1 %div.uni, label %Entry, label %B4
+
+Entry:
+  %div.cond = icmp eq i32 %tid, 0
+  br i1 %div.cond, label %B3, label %B0 ; divergent branch
+
+B0:
+  br label %B1
+
+B1:
+  br label %B2
+
+B2:
+  %a0 = add i32 %a, 1
+  %b0 = add i32 %b, 2
+  %c0 = add i32 %c, 3
+  br i1 %div.uni, label %B201, label %B202
+
+B201:
+  br label %B202
+
+B202:
+  br label %B6
+
+B3:
+  br label %B4
+
+B4:
+  %a1 = add i32 %a, 10
+  %b1 = add i32 %b, 20
+  %c1 = add i32 %c, 30
+  br i1 %div.uni, label %B5, label %B501
+
+B5:
+  br label %B501
+
+B501:
+  br label %B502
+
+B502:
+  br label %B6
+
+B6:
+  %div_a = phi i32 [%a0, %B202], [%a1,  %B502]
+  %div_b = phi i32 [%b0, %B202], [%b1,  %B502]
+  %div_c = phi i32 [%c0, %B202], [%c1,  %B502]
+  ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = {nounwind readnone }

>From 7a45497ded4522db84db61313de104baf1cc6be0 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Tue, 27 May 2025 11:43:04 +0530
Subject: [PATCH 5/5] simpler logic for irreducible parent cycle

---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 31 ++++++++++---------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 1c7ee02fe0f7a..03cace4e06498 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -614,6 +614,22 @@ template <typename ContextT> class DivergencePropagator {
 
     // Bootstrap with branch targets
     auto const *DivTermCycle = CI.getCycle(&DivTermBlock);
+
+    // Starting at the innermost cycle of DivTermBlock, walk up the parent
+    // chain and pick the outermost cycle reached through irreducible cycles
+    // only (nullptr if none). The lambda is defined and invoked in place.
+    const CycleT *IrreducibleAncestor = [](const CycleT *C) -> const CycleT* {
+      if (!C) return nullptr;
+      if (C->isReducible()) return nullptr;
+      while (const CycleT *P = C->getParentCycle()) {
+        if (P->isReducible()) return C;
+        C = P;
+      }
+      assert(!C->getParentCycle());
+      assert(!C->isReducible());
+      return C;
+    } (DivTermCycle);
+
     for (const auto *SuccBlock : successors(&DivTermBlock)) {
       if (DivTermCycle && !DivTermCycle->contains(SuccBlock)) {
         // If DivTerm exits the cycle immediately, computeJoin() might
@@ -626,19 +642,6 @@ template <typename ContextT> class DivergencePropagator {
       visitEdge(*SuccBlock, *SuccBlock);
     }
 
-    // Return true if B is inside an irreducible cycle
-    auto IsInIrreducibleCycle = [this](const BlockT *B) {
-      for (const auto *Cycle = CI.getCycle(B); Cycle;
-           Cycle = Cycle->getParentCycle()) {
-        // If everything is inside a reducible cycle, then look no further
-        if (Cycle->isReducible() && Cycle->contains(&DivTermBlock))
-          return false;
-        if (!Cycle->isReducible())
-          return true;
-      }
-      return false;
-    };
-
     // Technically propagation can continue until it reaches the last node.
     //
     // For efficiency, propagation can stop if FreshLabels.count()==1. But
@@ -652,7 +655,7 @@ template <typename ContextT> class DivergencePropagator {
       const auto *Block = CyclePOT[BlockIdx];
       // If no irreducible cycle, stop if freshLable.count() = 1 and Block
       // is the IPD. If it is in any irreducible cycle, continue propagation.
-      if (FreshLabels.count() == 1 && !IsInIrreducibleCycle(Block))
+      if (FreshLabels.count() == 1 && (!IrreducibleAncestor || !IrreducibleAncestor->contains(Block)))
         break;
 
       LLVM_DEBUG(dbgs() << "Current labels:\n"; printDefs(dbgs()));