[llvm] [Uniformity] Avoid marking all PHIs as divergent in join blocks (PR #157808)
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 25 02:46:57 PDT 2025
https://github.com/perlfu updated https://github.com/llvm/llvm-project/pull/157808
>From 7c44e3b518c364a586f3beea081b278da0fa5f5e Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 1 Sep 2025 13:34:54 +0900
Subject: [PATCH 1/2] [Uniformity] Avoid marking all PHIs as divergent in join
blocks
Attempt to avoid marking PHIs divergent in join blocks.
Only mark PHIs that contain values with a sync dependence on the
divergent terminator condition.
---
llvm/include/llvm/ADT/GenericSSAContext.h | 6 +-
llvm/include/llvm/ADT/GenericUniformityImpl.h | 51 +++++++-
llvm/lib/CodeGen/MachineSSAContext.cpp | 14 +++
llvm/lib/IR/SSAContext.cpp | 15 +++
.../AMDGPU/phi_no_divergence.ll | 114 ++++++++++++++++++
5 files changed, 194 insertions(+), 6 deletions(-)
create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll
diff --git a/llvm/include/llvm/ADT/GenericSSAContext.h b/llvm/include/llvm/ADT/GenericSSAContext.h
index 6aa3a8b9b6e0b..abcc093dd88ab 100644
--- a/llvm/include/llvm/ADT/GenericSSAContext.h
+++ b/llvm/include/llvm/ADT/GenericSSAContext.h
@@ -54,7 +54,7 @@ template <typename _FunctionT> class GenericSSAContext {
// The null value for ValueRefT. For LLVM IR and MIR, this is simply the
// default constructed value.
- static constexpr ValueRefT *ValueRefNull = {};
+ static constexpr ValueRefT ValueRefNull = {};
// An InstructionT usually defines one or more ValueT objects.
using InstructionT = typename SSATraits::InstructionT;
@@ -96,6 +96,10 @@ template <typename _FunctionT> class GenericSSAContext {
static bool isConstantOrUndefValuePhi(const InstructionT &Instr);
const BlockT *getDefBlock(ConstValueRefT value) const;
+ void getPhiInputs(const InstructionT &Instr,
+ SmallVectorImpl<ConstValueRefT> &Values,
+ SmallVectorImpl<const BlockT *> &Blocks) const;
+
Printable print(const BlockT *block) const;
Printable printAsOperand(const BlockT *BB) const;
Printable print(const InstructionT *inst) const;
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 3b9b7f2633771..0a1adc30e69e0 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -455,9 +455,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
/// the worklist.
void taintAndPushAllDefs(const BlockT &JoinBlock);
- /// \brief Mark all phi nodes in \p JoinBlock as divergent and push them on
- /// the worklist.
- void taintAndPushPhiNodes(const BlockT &JoinBlock);
+ /// \brief Mark phi nodes in \p JoinBlock as divergent and push them on
+ /// the worklist if they are divergent over the by the path \p JoinBlock
+ /// to \p DivTermBlock.
+ void taintAndPushPhiNodes(const BlockT &JoinBlock, const BlockT &DivTermBlock,
+ const DivergenceDescriptorT &DivDesc);
/// \brief Identify all Instructions that become divergent because \p DivExit
/// is a divergent cycle exit of \p DivCycle. Mark those instructions as
@@ -917,7 +919,8 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushAllDefs(
/// Mark divergent phi nodes in a join block
template <typename ContextT>
void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
- const BlockT &JoinBlock) {
+ const BlockT &JoinBlock, const BlockT &DivTermBlock,
+ const DivergenceDescriptorT &DivDesc) {
LLVM_DEBUG(dbgs() << "taintAndPushPhiNodes in " << Context.print(&JoinBlock)
<< "\n");
for (const auto &Phi : JoinBlock.phis()) {
@@ -930,6 +933,44 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
// https://reviews.llvm.org/D19013
if (ContextT::isConstantOrUndefValuePhi(Phi))
continue;
+
+ // Attempt to maintain uniformity for PHIs by considering control
+ // dependencies.
+ SmallVector<ConstValueRefT> Values;
+ SmallVector<const BlockT *> Blocks;
+ Context.getPhiInputs(Phi, Values, Blocks);
+ assert(Blocks.size() == Values.size());
+
+ // Allow an empty Blocks/Values list to signify getPhiInputs is not
+ // implemented, in which case no uniformity can be assumed.
+ bool Uniform = !Values.empty();
+
+ std::optional<ConstValueRefT> CommonValue;
+ for (unsigned I = 0; I < Blocks.size() && Uniform; ++I) {
+ if (DivDesc.CycleDivBlocks.contains(Blocks[I])) {
+ // If PHI is reachable via divergent exit it is divergent.
+ Uniform = false;
+ } else if (DT.dominates(&DivTermBlock, Blocks[I]) ||
+ DivDesc.BlockLabels.lookup_or(Blocks[I], nullptr)) {
+ // If all edges from the marked path share a common value then
+ // uniformity is preserved when the value is itself uniform.
+ if (!CommonValue)
+ CommonValue = Values[I];
+ else
+ Uniform = Values[I] == *CommonValue;
+ }
+ // Ignore undefined values when checking definitions.
+ if (!Values[I])
+ continue;
+ // Any value defined on the divergent path is divergent.
+ const BlockT *DefBlock = Context.getDefBlock(Values[I]);
+ if (DivDesc.BlockLabels.lookup_or(DefBlock, nullptr))
+ Uniform = false;
+ }
+ if (Uniform)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "tainted: " << Phi << "\n");
markDivergent(Phi);
}
}
@@ -1087,7 +1128,7 @@ void GenericUniformityAnalysisImpl<ContextT>::analyzeControlDivergence(
DivCycles.push_back(Outermost);
continue;
}
- taintAndPushPhiNodes(*JoinBlock);
+ taintAndPushPhiNodes(*JoinBlock, *DivTermBlock, DivDesc);
}
// Sort by order of decreasing depth. This allows later cycles to be skipped
diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp
index bbbfb3ce2788d..4b6f8d6b0f9f4 100644
--- a/llvm/lib/CodeGen/MachineSSAContext.cpp
+++ b/llvm/lib/CodeGen/MachineSSAContext.cpp
@@ -84,6 +84,20 @@ bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) {
return true;
}
+template <>
+void MachineSSAContext::getPhiInputs(
+ const MachineInstr &Phi, SmallVectorImpl<Register> &Values,
+ SmallVectorImpl<const MachineBasicBlock *> &Blocks) const {
+ if (!Phi.isPHI())
+ return;
+ for (unsigned Idx = 1, End = Phi.getNumOperands(); Idx < End; Idx += 2) {
+ // FIXME: ideally we would turn undef values into ValueRefNull.
+ // This could reduce number of PHIs marked in taintAndPushPhiNodes().
+ Values.push_back(Phi.getOperand(Idx).getReg());
+ Blocks.push_back(Phi.getOperand(Idx + 1).getMBB());
+ }
+}
+
template <>
Intrinsic::ID MachineSSAContext::getIntrinsicID(const MachineInstr &MI) {
if (auto *GI = dyn_cast<GIntrinsic>(&MI))
diff --git a/llvm/lib/IR/SSAContext.cpp b/llvm/lib/IR/SSAContext.cpp
index 20b6ea1e972d4..0249fe94d716e 100644
--- a/llvm/lib/IR/SSAContext.cpp
+++ b/llvm/lib/IR/SSAContext.cpp
@@ -14,6 +14,7 @@
#include "llvm/IR/SSAContext.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
@@ -68,6 +69,20 @@ bool SSAContext::isConstantOrUndefValuePhi(const Instruction &Instr) {
return false;
}
+template <>
+void SSAContext::getPhiInputs(
+ const Instruction &Instr, SmallVectorImpl<const Value *> &Values,
+ SmallVectorImpl<const BasicBlock *> &Blocks) const {
+ if (auto *Phi = dyn_cast<PHINode>(&Instr)) {
+ for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
+ const Value *Incoming = Phi->getIncomingValue(I);
+ const BasicBlock *Block = Phi->getIncomingBlock(I);
+ Values.push_back(!isa<UndefValue>(Incoming) ? Incoming : ValueRefNull);
+ Blocks.push_back(Block);
+ }
+ }
+}
+
template <> Intrinsic::ID SSAContext::getIntrinsicID(const Instruction &I) {
if (auto *CB = dyn_cast<CallBase>(&I))
return CB->getIntrinsicID();
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll
new file mode 100644
index 0000000000000..7e1018bf59d5f
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll
@@ -0,0 +1,114 @@
+; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
+
+define amdgpu_kernel void @no_divergent_exit1(i32 %a, i32 %b, i32 %c) #0 {
+; CHECK-LABEL: for function 'no_divergent_exit1'
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %div.cond = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT: %div.cond =
+ br label %header
+
+header:
+ %loop.b = phi i32 [ %b, %entry ], [ %new.b, %body.1 ], [ %new.b, %body.2 ]
+; CHECK-NOT: DIVERGENT: %loop.b =
+ %loop.c = phi i32 [ %c, %entry ], [ %loop.c, %body.1 ], [ %new.c, %body.2 ]
+; CHECK: DIVERGENT: %loop.c =
+ %exit.val = phi i32 [ %a, %entry ], [ %next.exit.val, %body.1 ], [ %next.exit.val, %body.2 ]
+; CHECK-NOT: DIVERGENT: %exit.val =
+ %exit.cond = icmp slt i32 %exit.val, 42
+; CHECK-NOT: DIVERGENT: %exit.cond =
+ br i1 %exit.cond, label %end, label %body.1
+; CHECK-NOT: DIVERGENT: br i1 %exit.cond,
+
+body.1:
+ %new.b = add i32 %loop.b, 1
+; CHECK-NOT: DIVERGENT: %new.b =
+ %next.exit.val = add i32 %exit.val, 1
+; CHECK-NOT: DIVERGENT: %next.exit.val =
+ br i1 %div.cond, label %body.2, label %header
+; CHECK: DIVERGENT: br i1 %div.cond,
+
+body.2:
+ %new.c = add i32 %loop.c, 1
+; CHECK: DIVERGENT: %new.c =
+ br label %header
+
+end:
+ ret void
+}
+
+define amdgpu_kernel void @no_divergent_exit2(i32 %a, i32 %b, i32 %c) #0 {
+; CHECK-LABEL: for function 'no_divergent_exit2'
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %div.cond = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT: %div.cond =
+ br label %header
+
+header:
+ %loop.b = phi i32 [ %b, %entry ], [ %merge.b, %merge ]
+; CHECK-NOT: DIVERGENT: %loop.b =
+ %loop.c = phi i32 [ %c, %entry ], [ %merge.c, %merge ]
+; CHECK: DIVERGENT: %loop.c =
+ %exit.val = phi i32 [ %a, %entry ], [ %next.exit.val, %merge ]
+; CHECK-NOT: DIVERGENT: %exit.val =
+ %exit.cond = icmp slt i32 %exit.val, 42
+; CHECK-NOT: DIVERGENT: %exit.cond =
+ br i1 %exit.cond, label %end, label %body.1
+; CHECK-NOT: DIVERGENT: br i1 %exit.cond,
+
+body.1:
+ %new.b = add i32 %loop.b, 1
+; CHECK-NOT: DIVERGENT: %new.b =
+ %next.exit.val = add i32 %exit.val, 1
+; CHECK-NOT: DIVERGENT: %next.exit.val =
+ br i1 %div.cond, label %body.2, label %merge
+; CHECK: DIVERGENT: br i1 %div.cond,
+
+body.2:
+ %new.c = add i32 %loop.c, 1
+; CHECK: DIVERGENT: %new.c =
+ br label %merge
+
+merge:
+ %merge.b = phi i32 [ %new.b, %body.1 ], [ %new.b, %body.2 ]
+; CHECK-NOT: DIVERGENT: %merge.b =
+ %merge.c = phi i32 [ %loop.c, %body.1 ], [ %new.c, %body.2 ]
+; CHECK: DIVERGENT: %merge.c =
+ br label %header
+
+end:
+ ret void
+}
+
+define amdgpu_kernel void @no_loop_phi_divergence(i32 %a) #0 {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %uni.cond = icmp slt i32 %a, 0
+; CHECK-NOT: DIVERGENT: %uni.cond =
+ %div.cond = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT: %div.cond =
+ br i1 %uni.cond, label %div.branch.block, label %merge
+; CHECK-NOT: DIVERGENT: br i1 %uni.cond,
+
+div.branch.block:
+ br i1 %div.cond, label %div.block.1, label %div.block.2
+; CHECK: DIVERGENT: br i1 %div.cond,
+
+div.block.1:
+ br label %merge
+
+div.block.2:
+ br label %merge
+
+merge:
+ %uni.val = phi i32 [ 0, %entry ], [ 1, %div.block.1 ], [ 1, %div.block.2 ]
+; CHECK-NOT: DIVERGENT: %uni.val =
+ %div.val = phi i32 [ 0, %entry ], [ 1, %div.block.1 ], [ 2, %div.block.2 ]
+; CHECK: DIVERGENT: %div.val =
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
>From 2804480ba4e62498fb09eb63a4bf844ac651738e Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Thu, 25 Sep 2025 18:28:44 +0900
Subject: [PATCH 2/2] - Rework based on reviewer feedback
---
llvm/include/llvm/ADT/GenericUniformityImpl.h | 46 ++++++++-----------
llvm/lib/CodeGen/MachineSSAContext.cpp | 12 +++--
llvm/lib/IR/SSAContext.cpp | 5 +-
.../AMDGPU/hidden_loopdiverge.ll | 4 +-
4 files changed, 33 insertions(+), 34 deletions(-)
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 0a1adc30e69e0..f30345d4ff60b 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -456,7 +456,7 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
void taintAndPushAllDefs(const BlockT &JoinBlock);
/// \brief Mark phi nodes in \p JoinBlock as divergent and push them on
- /// the worklist if they are divergent over the by the path \p JoinBlock
+ /// the worklist if they are divergent over the path from \p JoinBlock
/// to \p DivTermBlock.
void taintAndPushPhiNodes(const BlockT &JoinBlock, const BlockT &DivTermBlock,
const DivergenceDescriptorT &DivDesc);
@@ -924,18 +924,8 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
LLVM_DEBUG(dbgs() << "taintAndPushPhiNodes in " << Context.print(&JoinBlock)
<< "\n");
for (const auto &Phi : JoinBlock.phis()) {
- // FIXME: The non-undef value is not constant per se; it just happens to be
- // uniform and may not dominate this PHI. So assuming that the same value
- // reaches along all incoming edges may itself be undefined behaviour. This
- // particular interpretation of the undef value was added to
- // DivergenceAnalysis in the following review:
- //
- // https://reviews.llvm.org/D19013
- if (ContextT::isConstantOrUndefValuePhi(Phi))
- continue;
-
// Attempt to maintain uniformity for PHIs by considering control
- // dependencies.
+ // dependencies before marking them.
SmallVector<ConstValueRefT> Values;
SmallVector<const BlockT *> Blocks;
Context.getPhiInputs(Phi, Values, Blocks);
@@ -947,24 +937,24 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
std::optional<ConstValueRefT> CommonValue;
for (unsigned I = 0; I < Blocks.size() && Uniform; ++I) {
- if (DivDesc.CycleDivBlocks.contains(Blocks[I])) {
- // If PHI is reachable via divergent exit it is divergent.
- Uniform = false;
- } else if (DT.dominates(&DivTermBlock, Blocks[I]) ||
- DivDesc.BlockLabels.lookup_or(Blocks[I], nullptr)) {
- // If all edges from the marked path share a common value then
- // uniformity is preserved when the value is itself uniform.
- if (!CommonValue)
- CommonValue = Values[I];
- else
- Uniform = Values[I] == *CommonValue;
- }
- // Ignore undefined values when checking definitions.
+ // FIXME: We assume undefs are uniform and/or do not dominate the PHI
+ // in the presence of other constant or uniform values.
+ // This particular interpretation of the undef value was added to
+ // DivergenceAnalysis in the following review:
+ //
+ // https://reviews.llvm.org/D19013
if (!Values[I])
continue;
- // Any value defined on the divergent path is divergent.
- const BlockT *DefBlock = Context.getDefBlock(Values[I]);
- if (DivDesc.BlockLabels.lookup_or(DefBlock, nullptr))
+
+ // Only consider predecessors on the divergent path.
+ if (Blocks[I] != &DivTermBlock &&
+ !DivDesc.BlockLabels.lookup_or(Blocks[I], nullptr))
+ continue;
+
+ // Phi uniformity is maintained if all values on the divergent path match.
+ if (!CommonValue)
+ CommonValue = Values[I];
+ else if (Values[I] != *CommonValue)
Uniform = false;
}
if (Uniform)
diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp
index 4b6f8d6b0f9f4..c77953121a3e0 100644
--- a/llvm/lib/CodeGen/MachineSSAContext.cpp
+++ b/llvm/lib/CodeGen/MachineSSAContext.cpp
@@ -90,10 +90,16 @@ void MachineSSAContext::getPhiInputs(
SmallVectorImpl<const MachineBasicBlock *> &Blocks) const {
if (!Phi.isPHI())
return;
+
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo();
+ // Note: the PHI's result register (DstReg) is Phi.getOperand(0); see FIXME below.
for (unsigned Idx = 1, End = Phi.getNumOperands(); Idx < End; Idx += 2) {
- // FIXME: ideally we would turn undef values into ValueRefNull.
- // This could reduce number of PHIs marked in taintAndPushPhiNodes().
- Values.push_back(Phi.getOperand(Idx).getReg());
+ Register Incoming = Phi.getOperand(Idx).getReg();
+ MachineInstr *Def = MRI.getVRegDef(Incoming);
+ // FIXME: should this also consider Incoming == DstReg undef?
+ if (Def && isUndef(*Def))
+ Incoming = ValueRefNull;
+ Values.push_back(Incoming);
Blocks.push_back(Phi.getOperand(Idx + 1).getMBB());
}
}
diff --git a/llvm/lib/IR/SSAContext.cpp b/llvm/lib/IR/SSAContext.cpp
index 0249fe94d716e..0e3ff6f6910e4 100644
--- a/llvm/lib/IR/SSAContext.cpp
+++ b/llvm/lib/IR/SSAContext.cpp
@@ -77,7 +77,10 @@ void SSAContext::getPhiInputs(
for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
const Value *Incoming = Phi->getIncomingValue(I);
const BasicBlock *Block = Phi->getIncomingBlock(I);
- Values.push_back(!isa<UndefValue>(Incoming) ? Incoming : ValueRefNull);
+ // FIXME: should this also consider Incoming == &Instr undef?
+ if (isa<UndefValue>(Incoming))
+ Incoming = ValueRefNull;
+ Values.push_back(Incoming);
Blocks.push_back(Block);
}
}
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/hidden_loopdiverge.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/hidden_loopdiverge.ll
index a2467a5480940..9ed4ff37dff15 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/hidden_loopdiverge.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/hidden_loopdiverge.ll
@@ -172,9 +172,9 @@ X:
; CHECK: DIVERGENT: %div.merge.x =
Y:
- %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ]
+ %merge.y = phi i32 [ 42, %X ], [ %b, %B ]
ret void
-; CHECK: DIVERGENT: %div.merge.y =
+; CHECK-NOT: DIVERGENT: %merge.y =
}
; divergent loop (G<header>, L<exiting to D>) contained inside a uniform loop (H<header>, B, G, L , D<exiting to x>)
More information about the llvm-commits
mailing list