[llvm] [Uniformity] Avoid marking all PHIs as divergent in join blocks (PR #157808)

Thu Oct 9 01:04:13 PDT 2025

https://github.com/perlfu updated https://github.com/llvm/llvm-project/pull/157808

>From 7c44e3b518c364a586f3beea081b278da0fa5f5e Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 1 Sep 2025 13:34:54 +0900
Subject: [PATCH 1/6] [Uniformity] Avoid marking all PHIs as divergent in join
 blocks

Attempt to avoid marking every PHI in a join block as divergent.
Only mark PHIs whose incoming values have a sync dependence on the
divergent terminator condition.
---
 llvm/include/llvm/ADT/GenericSSAContext.h     |   6 +-
 llvm/include/llvm/ADT/GenericUniformityImpl.h |  51 +++++++-
 llvm/lib/CodeGen/MachineSSAContext.cpp        |  14 +++
 llvm/lib/IR/SSAContext.cpp                    |  15 +++
 .../AMDGPU/phi_no_divergence.ll               | 114 ++++++++++++++++++
 5 files changed, 194 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll

diff --git a/llvm/include/llvm/ADT/GenericSSAContext.h b/llvm/include/llvm/ADT/GenericSSAContext.h
index 6aa3a8b9b6e0b..abcc093dd88ab 100644
--- a/llvm/include/llvm/ADT/GenericSSAContext.h
+++ b/llvm/include/llvm/ADT/GenericSSAContext.h
@@ -54,7 +54,7 @@ template <typename _FunctionT> class GenericSSAContext {
 
   // The null value for ValueRefT. For LLVM IR and MIR, this is simply the
   // default constructed value.
-  static constexpr ValueRefT *ValueRefNull = {};
+  static constexpr ValueRefT ValueRefNull = {};
 
   // An InstructionT usually defines one or more ValueT objects.
   using InstructionT = typename SSATraits::InstructionT;
@@ -96,6 +96,10 @@ template <typename _FunctionT> class GenericSSAContext {
   static bool isConstantOrUndefValuePhi(const InstructionT &Instr);
   const BlockT *getDefBlock(ConstValueRefT value) const;
 
+  void getPhiInputs(const InstructionT &Instr,
+                    SmallVectorImpl<ConstValueRefT> &Values,
+                    SmallVectorImpl<const BlockT *> &Blocks) const;
+
   Printable print(const BlockT *block) const;
   Printable printAsOperand(const BlockT *BB) const;
   Printable print(const InstructionT *inst) const;
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 3b9b7f2633771..0a1adc30e69e0 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -455,9 +455,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   /// the worklist.
   void taintAndPushAllDefs(const BlockT &JoinBlock);
 
-  /// \brief Mark all phi nodes in \p JoinBlock as divergent and push them on
-  /// the worklist.
-  void taintAndPushPhiNodes(const BlockT &JoinBlock);
+  /// \brief Mark phi nodes in \p JoinBlock as divergent and push them on
+  /// the worklist if they are divergent over the by the path \p JoinBlock
+  /// to \p DivTermBlock.
+  void taintAndPushPhiNodes(const BlockT &JoinBlock, const BlockT &DivTermBlock,
+                            const DivergenceDescriptorT &DivDesc);
 
   /// \brief Identify all Instructions that become divergent because \p DivExit
   /// is a divergent cycle exit of \p DivCycle. Mark those instructions as
@@ -917,7 +919,8 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushAllDefs(
 /// Mark divergent phi nodes in a join block
 template <typename ContextT>
 void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
-    const BlockT &JoinBlock) {
+    const BlockT &JoinBlock, const BlockT &DivTermBlock,
+    const DivergenceDescriptorT &DivDesc) {
   LLVM_DEBUG(dbgs() << "taintAndPushPhiNodes in " << Context.print(&JoinBlock)
                     << "\n");
   for (const auto &Phi : JoinBlock.phis()) {
@@ -930,6 +933,44 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
     // https://reviews.llvm.org/D19013
     if (ContextT::isConstantOrUndefValuePhi(Phi))
       continue;
+
+    // Attempt to maintain uniformity for PHIs by considering control
+    // dependencies.
+    SmallVector<ConstValueRefT> Values;
+    SmallVector<const BlockT *> Blocks;
+    Context.getPhiInputs(Phi, Values, Blocks);
+    assert(Blocks.size() == Values.size());
+
+    // Allow an empty Blocks/Values list to signify getPhiInputs is not
+    // implemented; in which case no uniformity is possible.
+    bool Uniform = !Values.empty();
+
+    std::optional<ConstValueRefT> CommonValue;
+    for (unsigned I = 0; I < Blocks.size() && Uniform; ++I) {
+      if (DivDesc.CycleDivBlocks.contains(Blocks[I])) {
+        // If PHI is reachable via divergent exit it is divergent.
+        Uniform = false;
+      } else if (DT.dominates(&DivTermBlock, Blocks[I]) ||
+                 DivDesc.BlockLabels.lookup_or(Blocks[I], nullptr)) {
+        // If all edges from the marked path share a common value then
+        // uniformity is preserved when the value is itself uniform.
+        if (!CommonValue)
+          CommonValue = Values[I];
+        else
+          Uniform = Values[I] == *CommonValue;
+      }
+      // Ignore undefined values when checking definitions.
+      if (!Values[I])
+        continue;
+      // Any value defined on the divergent path is divergent.
+      const BlockT *DefBlock = Context.getDefBlock(Values[I]);
+      if (DivDesc.BlockLabels.lookup_or(DefBlock, nullptr))
+        Uniform = false;
+    }
+    if (Uniform)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "tainted: " << Phi << "\n");
     markDivergent(Phi);
   }
 }
@@ -1087,7 +1128,7 @@ void GenericUniformityAnalysisImpl<ContextT>::analyzeControlDivergence(
       DivCycles.push_back(Outermost);
       continue;
     }
-    taintAndPushPhiNodes(*JoinBlock);
+    taintAndPushPhiNodes(*JoinBlock, *DivTermBlock, DivDesc);
   }
 
   // Sort by order of decreasing depth. This allows later cycles to be skipped
diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp
index bbbfb3ce2788d..4b6f8d6b0f9f4 100644
--- a/llvm/lib/CodeGen/MachineSSAContext.cpp
+++ b/llvm/lib/CodeGen/MachineSSAContext.cpp
@@ -84,6 +84,20 @@ bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) {
   return true;
 }
 
+template <>
+void MachineSSAContext::getPhiInputs(
+    const MachineInstr &Phi, SmallVectorImpl<Register> &Values,
+    SmallVectorImpl<const MachineBasicBlock *> &Blocks) const {
+  if (!Phi.isPHI())
+    return;
+  for (unsigned Idx = 1, End = Phi.getNumOperands(); Idx < End; Idx += 2) {
+    // FIXME: ideally we would turn undef values into ValueRefNull.
+    // This could reduce number of PHIs marked in taintAndPushPhiNodes().
+    Values.push_back(Phi.getOperand(Idx).getReg());
+    Blocks.push_back(Phi.getOperand(Idx + 1).getMBB());
+  }
+}
+
 template <>
 Intrinsic::ID MachineSSAContext::getIntrinsicID(const MachineInstr &MI) {
   if (auto *GI = dyn_cast<GIntrinsic>(&MI))
diff --git a/llvm/lib/IR/SSAContext.cpp b/llvm/lib/IR/SSAContext.cpp
index 20b6ea1e972d4..0249fe94d716e 100644
--- a/llvm/lib/IR/SSAContext.cpp
+++ b/llvm/lib/IR/SSAContext.cpp
@@ -14,6 +14,7 @@
 
 #include "llvm/IR/SSAContext.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
@@ -68,6 +69,20 @@ bool SSAContext::isConstantOrUndefValuePhi(const Instruction &Instr) {
   return false;
 }
 
+template <>
+void SSAContext::getPhiInputs(
+    const Instruction &Instr, SmallVectorImpl<const Value *> &Values,
+    SmallVectorImpl<const BasicBlock *> &Blocks) const {
+  if (auto *Phi = dyn_cast<PHINode>(&Instr)) {
+    for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
+      const Value *Incoming = Phi->getIncomingValue(I);
+      const BasicBlock *Block = Phi->getIncomingBlock(I);
+      Values.push_back(!isa<UndefValue>(Incoming) ? Incoming : ValueRefNull);
+      Blocks.push_back(Block);
+    }
+  }
+}
+
 template <> Intrinsic::ID SSAContext::getIntrinsicID(const Instruction &I) {
   if (auto *CB = dyn_cast<CallBase>(&I))
     return CB->getIntrinsicID();
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll
new file mode 100644
index 0000000000000..7e1018bf59d5f
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll
@@ -0,0 +1,114 @@
+; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
+
+define amdgpu_kernel void @no_divergent_exit1(i32 %a, i32 %b, i32 %c) #0 {
+; CHECK-LABEL: for function 'no_divergent_exit1'
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT: %div.cond =
+  br label %header
+
+header:
+  %loop.b = phi i32 [ %b, %entry ], [ %new.b, %body.1 ], [ %new.b, %body.2 ]
+; CHECK-NOT: DIVERGENT: %loop.b =
+  %loop.c = phi i32 [ %c, %entry ], [ %loop.c, %body.1 ], [ %new.c, %body.2 ]
+; CHECK: DIVERGENT: %loop.c =
+  %exit.val = phi i32 [ %a, %entry ], [ %next.exit.val, %body.1 ], [ %next.exit.val, %body.2 ]
+; CHECK-NOT: DIVERGENT: %exit.val =
+  %exit.cond = icmp slt i32 %exit.val, 42
+; CHECK-NOT: DIVERGENT: %exit.cond =
+  br i1 %exit.cond, label %end, label %body.1
+; CHECK-NOT: DIVERGENT: br i1 %exit.cond,
+
+body.1:
+  %new.b = add i32 %loop.b, 1
+; CHECK-NOT: DIVERGENT: %new.b =
+  %next.exit.val = add i32 %exit.val, 1
+; CHECK-NOT: DIVERGENT: %next.exit.val =
+  br i1 %div.cond, label %body.2, label %header
+; CHECK: DIVERGENT: br i1 %div.cond,
+
+body.2:
+  %new.c = add i32 %loop.c, 1
+; CHECK: DIVERGENT: %new.c =
+  br label %header
+
+end:
+  ret void
+}
+
+define amdgpu_kernel void @no_divergent_exit2(i32 %a, i32 %b, i32 %c) #0 {
+; CHECK-LABEL: for function 'no_divergent_exit2'
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT: %div.cond =
+  br label %header
+
+header:
+  %loop.b = phi i32 [ %b, %entry ], [ %merge.b, %merge ]
+; CHECK-NOT: DIVERGENT: %loop.b =
+  %loop.c = phi i32 [ %c, %entry ], [ %merge.c, %merge ]
+; CHECK: DIVERGENT: %loop.c =
+  %exit.val = phi i32 [ %a, %entry ], [ %next.exit.val, %merge ]
+; CHECK-NOT: DIVERGENT: %exit.val =
+  %exit.cond = icmp slt i32 %exit.val, 42
+; CHECK-NOT: DIVERGENT: %exit.cond =
+  br i1 %exit.cond, label %end, label %body.1
+; CHECK-NOT: DIVERGENT: br i1 %exit.cond,
+
+body.1:
+  %new.b = add i32 %loop.b, 1
+; CHECK-NOT: DIVERGENT: %new.b =
+  %next.exit.val = add i32 %exit.val, 1
+; CHECK-NOT: DIVERGENT: %next.exit.val =
+  br i1 %div.cond, label %body.2, label %merge
+; CHECK: DIVERGENT: br i1 %div.cond,
+
+body.2:
+  %new.c = add i32 %loop.c, 1
+; CHECK: DIVERGENT: %new.c =
+  br label %merge
+
+merge:
+  %merge.b = phi i32 [ %new.b, %body.1 ], [ %new.b, %body.2 ]
+; CHECK-NOT: DIVERGENT: %merge.b =
+  %merge.c = phi i32 [ %loop.c, %body.1 ], [ %new.c, %body.2 ]
+; CHECK: DIVERGENT: %merge.c =
+  br label %header
+
+end:
+  ret void
+}
+
+define amdgpu_kernel void @no_loop_phi_divergence(i32 %a) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+; CHECK-NOT: DIVERGENT: %uni.cond =
+  %div.cond = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT: %div.cond =
+  br i1 %uni.cond, label %div.branch.block, label %merge
+; CHECK-NOT: DIVERGENT: br i1 %uni.cond,
+
+div.branch.block:
+  br i1 %div.cond, label %div.block.1, label %div.block.2
+; CHECK: DIVERGENT: br i1 %div.cond,
+
+div.block.1:
+  br label %merge
+
+div.block.2:
+  br label %merge
+
+merge:
+  %uni.val = phi i32 [ 0, %entry ], [ 1, %div.block.1 ], [ 1, %div.block.2 ]
+; CHECK-NOT: DIVERGENT: %uni.val =
+  %div.val = phi i32 [ 0, %entry ], [ 1, %div.block.1 ], [ 2, %div.block.2 ]
+; CHECK: DIVERGENT: %div.val =
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }

>From 2804480ba4e62498fb09eb63a4bf844ac651738e Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Thu, 25 Sep 2025 18:28:44 +0900
Subject: [PATCH 2/6] - Rework based on reviewer feedback

---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 46 ++++++++-----------
 llvm/lib/CodeGen/MachineSSAContext.cpp        | 12 +++--
 llvm/lib/IR/SSAContext.cpp                    |  5 +-
 .../AMDGPU/hidden_loopdiverge.ll              |  4 +-
 4 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 0a1adc30e69e0..f30345d4ff60b 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -456,7 +456,7 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   void taintAndPushAllDefs(const BlockT &JoinBlock);
 
   /// \brief Mark phi nodes in \p JoinBlock as divergent and push them on
-  /// the worklist if they are divergent over the by the path \p JoinBlock
+  /// the worklist if they are divergent over the path from \p JoinBlock
   /// to \p DivTermBlock.
   void taintAndPushPhiNodes(const BlockT &JoinBlock, const BlockT &DivTermBlock,
                             const DivergenceDescriptorT &DivDesc);
@@ -924,18 +924,8 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
   LLVM_DEBUG(dbgs() << "taintAndPushPhiNodes in " << Context.print(&JoinBlock)
                     << "\n");
   for (const auto &Phi : JoinBlock.phis()) {
-    // FIXME: The non-undef value is not constant per se; it just happens to be
-    // uniform and may not dominate this PHI. So assuming that the same value
-    // reaches along all incoming edges may itself be undefined behaviour. This
-    // particular interpretation of the undef value was added to
-    // DivergenceAnalysis in the following review:
-    //
-    // https://reviews.llvm.org/D19013
-    if (ContextT::isConstantOrUndefValuePhi(Phi))
-      continue;
-
     // Attempt to maintain uniformity for PHIs by considering control
-    // dependencies.
+    // dependencies before marking them.
     SmallVector<ConstValueRefT> Values;
     SmallVector<const BlockT *> Blocks;
     Context.getPhiInputs(Phi, Values, Blocks);
@@ -947,24 +937,24 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
 
     std::optional<ConstValueRefT> CommonValue;
     for (unsigned I = 0; I < Blocks.size() && Uniform; ++I) {
-      if (DivDesc.CycleDivBlocks.contains(Blocks[I])) {
-        // If PHI is reachable via divergent exit it is divergent.
-        Uniform = false;
-      } else if (DT.dominates(&DivTermBlock, Blocks[I]) ||
-                 DivDesc.BlockLabels.lookup_or(Blocks[I], nullptr)) {
-        // If all edges from the marked path share a common value then
-        // uniformity is preserved when the value is itself uniform.
-        if (!CommonValue)
-          CommonValue = Values[I];
-        else
-          Uniform = Values[I] == *CommonValue;
-      }
-      // Ignore undefined values when checking definitions.
+      // FIXME: We assume undefs are uniform and/or do not dominate the PHI
+      // in the presence of other constant or uniform values.
+      // This particular interpretation of the undef value was added to
+      // DivergenceAnalysis in the following review:
+      //
+      // https://reviews.llvm.org/D19013
       if (!Values[I])
         continue;
-      // Any value defined on the divergent path is divergent.
-      const BlockT *DefBlock = Context.getDefBlock(Values[I]);
-      if (DivDesc.BlockLabels.lookup_or(DefBlock, nullptr))
+
+      // Only consider predecessors on divergent path.
+      if (Blocks[I] != &DivTermBlock &&
+          !DivDesc.BlockLabels.lookup_or(Blocks[I], nullptr))
+        continue;
+
+      // Phi uniformity is maintained if all values on divergent path match.
+      if (!CommonValue)
+        CommonValue = Values[I];
+      else if (Values[I] != *CommonValue)
         Uniform = false;
     }
     if (Uniform)
diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp
index 4b6f8d6b0f9f4..c77953121a3e0 100644
--- a/llvm/lib/CodeGen/MachineSSAContext.cpp
+++ b/llvm/lib/CodeGen/MachineSSAContext.cpp
@@ -90,10 +90,16 @@ void MachineSSAContext::getPhiInputs(
     SmallVectorImpl<const MachineBasicBlock *> &Blocks) const {
   if (!Phi.isPHI())
     return;
+
+  const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo();
+  // const Register DstReg = Phi.getOperand(0).getReg();
   for (unsigned Idx = 1, End = Phi.getNumOperands(); Idx < End; Idx += 2) {
-    // FIXME: ideally we would turn undef values into ValueRefNull.
-    // This could reduce number of PHIs marked in taintAndPushPhiNodes().
-    Values.push_back(Phi.getOperand(Idx).getReg());
+    Register Incoming = Phi.getOperand(Idx).getReg();
+    MachineInstr *Def = MRI.getVRegDef(Incoming);
+    // FIXME: should this also consider Incoming == DstReg undef?
+    if (Def && isUndef(*Def))
+      Incoming = ValueRefNull;
+    Values.push_back(Incoming);
     Blocks.push_back(Phi.getOperand(Idx + 1).getMBB());
   }
 }
diff --git a/llvm/lib/IR/SSAContext.cpp b/llvm/lib/IR/SSAContext.cpp
index 0249fe94d716e..0e3ff6f6910e4 100644
--- a/llvm/lib/IR/SSAContext.cpp
+++ b/llvm/lib/IR/SSAContext.cpp
@@ -77,7 +77,10 @@ void SSAContext::getPhiInputs(
     for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
       const Value *Incoming = Phi->getIncomingValue(I);
       const BasicBlock *Block = Phi->getIncomingBlock(I);
-      Values.push_back(!isa<UndefValue>(Incoming) ? Incoming : ValueRefNull);
+      // FIXME: should this also consider Incoming == &Instr undef?
+      if (isa<UndefValue>(Incoming))
+        Incoming = ValueRefNull;
+      Values.push_back(Incoming);
       Blocks.push_back(Block);
     }
   }
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/hidden_loopdiverge.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/hidden_loopdiverge.ll
index a2467a5480940..9ed4ff37dff15 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/hidden_loopdiverge.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/hidden_loopdiverge.ll
@@ -172,9 +172,9 @@ X:
 ; CHECK: DIVERGENT: %div.merge.x =
 
 Y:
-  %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ]
+  %merge.y = phi i32 [ 42, %X ], [ %b, %B ]
   ret void
-; CHECK: DIVERGENT: %div.merge.y =
+; CHECK-NOT: DIVERGENT: %merge.y =
 }
 
 ; divergent loop (G<header>, L<exiting to D>) contained inside a uniform loop (H<header>, B, G, L , D<exiting to x>)

>From ca0707770af2accd49250fbeae3bf6e79995b21b Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 29 Sep 2025 14:37:33 +0900
Subject: [PATCH 3/6] - Address reviewer comments

---
 llvm/include/llvm/ADT/GenericUniformityImpl.h                 | 4 ++++
 llvm/lib/CodeGen/MachineSSAContext.cpp                        | 3 +--
 .../UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir     | 2 +-
 .../Analysis/UniformityAnalysis/AMDGPU/hidden_loopdiverge.ll  | 4 ++--
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index f30345d4ff60b..96158f074886e 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -956,6 +956,10 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
         CommonValue = Values[I];
       else if (Values[I] != *CommonValue)
         Uniform = false;
+
+      // Phi is reached via divergent exit (i.e. respect temporal divergence).
+      if (DivDesc.CycleDivBlocks.contains(Blocks[I]))
+        Uniform = false;
     }
     if (Uniform)
       continue;
diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp
index c77953121a3e0..2a378772d93bb 100644
--- a/llvm/lib/CodeGen/MachineSSAContext.cpp
+++ b/llvm/lib/CodeGen/MachineSSAContext.cpp
@@ -88,8 +88,7 @@ template <>
 void MachineSSAContext::getPhiInputs(
     const MachineInstr &Phi, SmallVectorImpl<Register> &Values,
     SmallVectorImpl<const MachineBasicBlock *> &Blocks) const {
-  if (!Phi.isPHI())
-    return;
+  assert(Phi.isPHI());
 
   const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo();
   // const Register DstReg = Phi.getOperand(0).getReg();
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir
index fd811e276c593..efaa5de006c38 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir
@@ -23,7 +23,7 @@
 # CHECK-NOT: DIVERGENT: G_BR %bb.5
 
 # CHECK-LABEL: BLOCK bb.4
-# CHECK-NOT: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.3, %{{[0-9]*}}:_(s32), %bb.2
+# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.3, %{{[0-9]*}}:_(s32), %bb.2
 
 # CHECK-LABEL: BLOCK bb.5
 # CHECK-NOT: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.3, %{{[0-9]*}}:_(s32), %bb.4
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/hidden_loopdiverge.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/hidden_loopdiverge.ll
index 9ed4ff37dff15..a2467a5480940 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/hidden_loopdiverge.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/hidden_loopdiverge.ll
@@ -172,9 +172,9 @@ X:
 ; CHECK: DIVERGENT: %div.merge.x =
 
 Y:
-  %merge.y = phi i32 [ 42, %X ], [ %b, %B ]
+  %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ]
   ret void
-; CHECK-NOT: DIVERGENT: %merge.y =
+; CHECK: DIVERGENT: %div.merge.y =
 }
 
 ; divergent loop (G<header>, L<exiting to D>) contained inside a uniform loop (H<header>, B, G, L , D<exiting to x>)

>From f5fdd897c5047acea04b501080aa440c3925a445 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 29 Sep 2025 16:38:45 +0900
Subject: [PATCH 4/6] - Consider Phi with common value and undefs uniform -
 Address reviewer comments

---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 21 +++++++++++++------
 llvm/lib/IR/SSAContext.cpp                    | 20 +++++++++---------
 .../AMDGPU/MIR/hidden-loop-diverge.mir        |  2 +-
 3 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 96158f074886e..7a3158b8ebc8c 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -935,8 +935,10 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
     // implemented; in which case no uniformity is possible.
     bool Uniform = !Values.empty();
 
-    std::optional<ConstValueRefT> CommonValue;
-    for (unsigned I = 0; I < Blocks.size() && Uniform; ++I) {
+    std::optional<ConstValueRefT> PhiCommon, PathCommon;
+    bool HasSingleValue = true;
+    for (unsigned I = 0; I < Blocks.size() && (HasSingleValue || Uniform);
+         ++I) {
       // FIXME: We assume undefs are uniform and/or do not dominate the PHI
       // in the presence of other constant or uniform values.
       // This particular interpretation of the undef value was added to
@@ -946,22 +948,29 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
       if (!Values[I])
         continue;
 
+      // Track common value for all inputs.
+      if (!PhiCommon)
+        PhiCommon = Values[I];
+      else if (Values[I] != *PhiCommon)
+        HasSingleValue = false;
+
       // Only consider predecessors on divergent path.
       if (Blocks[I] != &DivTermBlock &&
           !DivDesc.BlockLabels.lookup_or(Blocks[I], nullptr))
         continue;
 
       // Phi uniformity is maintained if all values on divergent path match.
-      if (!CommonValue)
-        CommonValue = Values[I];
-      else if (Values[I] != *CommonValue)
+      if (!PathCommon)
+        PathCommon = Values[I];
+      else if (Values[I] != *PathCommon)
         Uniform = false;
 
       // Phi is reached via divergent exit (i.e. respect temporal divergence).
       if (DivDesc.CycleDivBlocks.contains(Blocks[I]))
         Uniform = false;
     }
-    if (Uniform)
+
+    if (Uniform || HasSingleValue)
       continue;
 
     LLVM_DEBUG(dbgs() << "tainted: " << Phi << "\n");
diff --git a/llvm/lib/IR/SSAContext.cpp b/llvm/lib/IR/SSAContext.cpp
index 0e3ff6f6910e4..bc35d904ba61f 100644
--- a/llvm/lib/IR/SSAContext.cpp
+++ b/llvm/lib/IR/SSAContext.cpp
@@ -73,16 +73,16 @@ template <>
 void SSAContext::getPhiInputs(
     const Instruction &Instr, SmallVectorImpl<const Value *> &Values,
     SmallVectorImpl<const BasicBlock *> &Blocks) const {
-  if (auto *Phi = dyn_cast<PHINode>(&Instr)) {
-    for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
-      const Value *Incoming = Phi->getIncomingValue(I);
-      const BasicBlock *Block = Phi->getIncomingBlock(I);
-      // FIXME: should this also consider Incoming == &Instr undef?
-      if (isa<UndefValue>(Incoming))
-        Incoming = ValueRefNull;
-      Values.push_back(Incoming);
-      Blocks.push_back(Block);
-    }
+  assert(isa<PHINode>(Instr));
+  const PHINode *Phi = static_cast<const PHINode *>(&Instr);
+  for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
+    const Value *Incoming = Phi->getIncomingValue(I);
+    const BasicBlock *Block = Phi->getIncomingBlock(I);
+    // FIXME: should this also consider Incoming == &Instr undef?
+    if (isa<UndefValue>(Incoming))
+      Incoming = ValueRefNull;
+    Values.push_back(Incoming);
+    Blocks.push_back(Block);
   }
 }
 
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir
index efaa5de006c38..fd811e276c593 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-loop-diverge.mir
@@ -23,7 +23,7 @@
 # CHECK-NOT: DIVERGENT: G_BR %bb.5
 
 # CHECK-LABEL: BLOCK bb.4
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.3, %{{[0-9]*}}:_(s32), %bb.2
+# CHECK-NOT: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.3, %{{[0-9]*}}:_(s32), %bb.2
 
 # CHECK-LABEL: BLOCK bb.5
 # CHECK-NOT: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.3, %{{[0-9]*}}:_(s32), %bb.4

>From d2f469a66f2c81721f1f9692913b4a7130359f8a Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 6 Oct 2025 17:39:51 +0900
Subject: [PATCH 5/6] - Address reviewer comments

---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 24 ++++++++++++-------
 .../AMDGPU/phi_no_divergence.ll               |  7 ++++++
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 7a3158b8ebc8c..ce1ffac21714b 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -933,11 +933,11 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
 
     // Allow an empty Blocks/Values list to signify getPhiInputs is not
     // implemented; in which case no uniformity is possible.
-    bool Uniform = !Values.empty();
+    bool HasSingleValue = !Values.empty();
+    bool UniformOnPath = HasSingleValue;
 
     std::optional<ConstValueRefT> PhiCommon, PathCommon;
-    bool HasSingleValue = true;
-    for (unsigned I = 0; I < Blocks.size() && (HasSingleValue || Uniform);
+    for (unsigned I = 0; I < Blocks.size() && (UniformOnPath || HasSingleValue);
          ++I) {
       // FIXME: We assume undefs are uniform and/or do not dominate the PHI
       // in the presence of other constant or uniform values.
@@ -954,23 +954,29 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
       else if (Values[I] != *PhiCommon)
         HasSingleValue = false;
 
+      // Divergent path does not have uniform value.
+      if (!UniformOnPath)
+        continue;
+
       // Only consider predecessors on divergent path.
       if (Blocks[I] != &DivTermBlock &&
           !DivDesc.BlockLabels.lookup_or(Blocks[I], nullptr))
         continue;
 
+      // Phi is reached via divergent exit (i.e. respect temporal divergence).
+      if (DivDesc.CycleDivBlocks.contains(Blocks[I])) {
+        UniformOnPath = false;
+        continue;
+      }
+
       // Phi uniformity is maintained if all values on divergent path match.
       if (!PathCommon)
         PathCommon = Values[I];
       else if (Values[I] != *PathCommon)
-        Uniform = false;
-
-      // Phi is reached via divergent exit (i.e. respect temporal divergence).
-      if (DivDesc.CycleDivBlocks.contains(Blocks[I]))
-        Uniform = false;
+        UniformOnPath = false;
     }
 
-    if (Uniform || HasSingleValue)
+    if (UniformOnPath || HasSingleValue)
       continue;
 
     LLVM_DEBUG(dbgs() << "tainted: " << Phi << "\n");
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll
index 7e1018bf59d5f..be7a5e356e67b 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll
@@ -1,5 +1,10 @@
 ; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
 
+; Test PHIs that are uniform because they have a common/constant value over
+; the divergent paths.
+
+; Loop is uniform because loop exit PHI has constant value over all internal
+; divergent paths.
 define amdgpu_kernel void @no_divergent_exit1(i32 %a, i32 %b, i32 %c) #0 {
 ; CHECK-LABEL: for function 'no_divergent_exit1'
 entry:
@@ -37,6 +42,7 @@ end:
   ret void
 }
 
+; As no_divergent_exit1 but with merge block before exit.
 define amdgpu_kernel void @no_divergent_exit2(i32 %a, i32 %b, i32 %c) #0 {
 ; CHECK-LABEL: for function 'no_divergent_exit2'
 entry:
@@ -81,6 +87,7 @@ end:
   ret void
 }
 
+; Test PHI with constant value over divergent path without a loop.
 define amdgpu_kernel void @no_loop_phi_divergence(i32 %a) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

>From ce186d51bda576afdcd51263c99b572579401a18 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Thu, 9 Oct 2025 16:58:55 +0900
Subject: [PATCH 6/6] - Incorporate reviewer feedback

---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index ce1ffac21714b..888dc074e054c 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -972,8 +972,11 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
       // Phi uniformity is maintained if all values on divergent path match.
       if (!PathCommon)
         PathCommon = Values[I];
-      else if (Values[I] != *PathCommon)
+      else if (Values[I] != *PathCommon) {
         UniformOnPath = false;
+        assert(!HasSingleValue);
+        break;
+      }
     }
 
     if (UniformOnPath || HasSingleValue)