[llvm] [Uniformity] Avoid marking all PHIs as divergent in join blocks (PR #157808)
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 10 01:02:56 PDT 2025
https://github.com/perlfu created https://github.com/llvm/llvm-project/pull/157808
Attempt to avoid marking PHIs divergent in join blocks. Only mark PHIs which contain values with sync dependence on the divergent terminator condition.
>From 7c44e3b518c364a586f3beea081b278da0fa5f5e Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 1 Sep 2025 13:34:54 +0900
Subject: [PATCH] [Uniformity] Avoid marking all PHIs as divergent in join
blocks
Attempt to avoid marking PHIs divergent in join blocks.
Only mark PHIs which contain values with sync dependence on the
divergent terminator condition.
---
llvm/include/llvm/ADT/GenericSSAContext.h | 6 +-
llvm/include/llvm/ADT/GenericUniformityImpl.h | 51 +++++++-
llvm/lib/CodeGen/MachineSSAContext.cpp | 14 +++
llvm/lib/IR/SSAContext.cpp | 15 +++
.../AMDGPU/phi_no_divergence.ll | 114 ++++++++++++++++++
5 files changed, 194 insertions(+), 6 deletions(-)
create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll
diff --git a/llvm/include/llvm/ADT/GenericSSAContext.h b/llvm/include/llvm/ADT/GenericSSAContext.h
index 6aa3a8b9b6e0b..abcc093dd88ab 100644
--- a/llvm/include/llvm/ADT/GenericSSAContext.h
+++ b/llvm/include/llvm/ADT/GenericSSAContext.h
@@ -54,7 +54,7 @@ template <typename _FunctionT> class GenericSSAContext {
// The null value for ValueRefT. For LLVM IR and MIR, this is simply the
// default constructed value.
- static constexpr ValueRefT *ValueRefNull = {};
+ static constexpr ValueRefT ValueRefNull = {};
// An InstructionT usually defines one or more ValueT objects.
using InstructionT = typename SSATraits::InstructionT;
@@ -96,6 +96,10 @@ template <typename _FunctionT> class GenericSSAContext {
static bool isConstantOrUndefValuePhi(const InstructionT &Instr);
const BlockT *getDefBlock(ConstValueRefT value) const;
+ void getPhiInputs(const InstructionT &Instr,
+ SmallVectorImpl<ConstValueRefT> &Values,
+ SmallVectorImpl<const BlockT *> &Blocks) const;
+
Printable print(const BlockT *block) const;
Printable printAsOperand(const BlockT *BB) const;
Printable print(const InstructionT *inst) const;
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 3b9b7f2633771..0a1adc30e69e0 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -455,9 +455,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
/// the worklist.
void taintAndPushAllDefs(const BlockT &JoinBlock);
- /// \brief Mark all phi nodes in \p JoinBlock as divergent and push them on
- /// the worklist.
- void taintAndPushPhiNodes(const BlockT &JoinBlock);
+ /// \brief Mark phi nodes in \p JoinBlock as divergent and push them on
+ /// the worklist if they are divergent over the path from \p DivTermBlock
+ /// to \p JoinBlock.
+ void taintAndPushPhiNodes(const BlockT &JoinBlock, const BlockT &DivTermBlock,
+ const DivergenceDescriptorT &DivDesc);
/// \brief Identify all Instructions that become divergent because \p DivExit
/// is a divergent cycle exit of \p DivCycle. Mark those instructions as
@@ -917,7 +919,8 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushAllDefs(
/// Mark divergent phi nodes in a join block
template <typename ContextT>
void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
- const BlockT &JoinBlock) {
+ const BlockT &JoinBlock, const BlockT &DivTermBlock,
+ const DivergenceDescriptorT &DivDesc) {
LLVM_DEBUG(dbgs() << "taintAndPushPhiNodes in " << Context.print(&JoinBlock)
<< "\n");
for (const auto &Phi : JoinBlock.phis()) {
@@ -930,6 +933,44 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
// https://reviews.llvm.org/D19013
if (ContextT::isConstantOrUndefValuePhi(Phi))
continue;
+
+ // Attempt to maintain uniformity for PHIs by considering control
+ // dependencies.
+ SmallVector<ConstValueRefT> Values;
+ SmallVector<const BlockT *> Blocks;
+ Context.getPhiInputs(Phi, Values, Blocks);
+ assert(Blocks.size() == Values.size());
+
+ // Allow an empty Blocks/Values list to signify getPhiInputs is not
+ // implemented, in which case the PHI is conservatively treated as divergent.
+ bool Uniform = !Values.empty();
+
+ std::optional<ConstValueRefT> CommonValue;
+ for (unsigned I = 0; I < Blocks.size() && Uniform; ++I) {
+ if (DivDesc.CycleDivBlocks.contains(Blocks[I])) {
+ // If the PHI is reachable via a divergent cycle exit, it is divergent.
+ Uniform = false;
+ } else if (DT.dominates(&DivTermBlock, Blocks[I]) ||
+ DivDesc.BlockLabels.lookup_or(Blocks[I], nullptr)) {
+ // If all edges from the marked path share a common value then
+ // uniformity is preserved when the value is itself uniform.
+ if (!CommonValue)
+ CommonValue = Values[I];
+ else
+ Uniform = Values[I] == *CommonValue;
+ }
+ // Ignore undefined values when checking definitions.
+ if (!Values[I])
+ continue;
+ // Any value defined on the divergent path is divergent.
+ const BlockT *DefBlock = Context.getDefBlock(Values[I]);
+ if (DivDesc.BlockLabels.lookup_or(DefBlock, nullptr))
+ Uniform = false;
+ }
+ if (Uniform)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "tainted: " << Phi << "\n");
markDivergent(Phi);
}
}
@@ -1087,7 +1128,7 @@ void GenericUniformityAnalysisImpl<ContextT>::analyzeControlDivergence(
DivCycles.push_back(Outermost);
continue;
}
- taintAndPushPhiNodes(*JoinBlock);
+ taintAndPushPhiNodes(*JoinBlock, *DivTermBlock, DivDesc);
}
// Sort by order of decreasing depth. This allows later cycles to be skipped
diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp
index bbbfb3ce2788d..4b6f8d6b0f9f4 100644
--- a/llvm/lib/CodeGen/MachineSSAContext.cpp
+++ b/llvm/lib/CodeGen/MachineSSAContext.cpp
@@ -84,6 +84,20 @@ bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) {
return true;
}
+template <>
+void MachineSSAContext::getPhiInputs(
+ const MachineInstr &Phi, SmallVectorImpl<Register> &Values,
+ SmallVectorImpl<const MachineBasicBlock *> &Blocks) const {
+ if (!Phi.isPHI())
+ return;
+ for (unsigned Idx = 1, End = Phi.getNumOperands(); Idx < End; Idx += 2) {
+ // FIXME: ideally we would turn undef values into ValueRefNull.
+ // This could reduce number of PHIs marked in taintAndPushPhiNodes().
+ Values.push_back(Phi.getOperand(Idx).getReg());
+ Blocks.push_back(Phi.getOperand(Idx + 1).getMBB());
+ }
+}
+
template <>
Intrinsic::ID MachineSSAContext::getIntrinsicID(const MachineInstr &MI) {
if (auto *GI = dyn_cast<GIntrinsic>(&MI))
diff --git a/llvm/lib/IR/SSAContext.cpp b/llvm/lib/IR/SSAContext.cpp
index 20b6ea1e972d4..0249fe94d716e 100644
--- a/llvm/lib/IR/SSAContext.cpp
+++ b/llvm/lib/IR/SSAContext.cpp
@@ -14,6 +14,7 @@
#include "llvm/IR/SSAContext.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
@@ -68,6 +69,20 @@ bool SSAContext::isConstantOrUndefValuePhi(const Instruction &Instr) {
return false;
}
+template <>
+void SSAContext::getPhiInputs(
+ const Instruction &Instr, SmallVectorImpl<const Value *> &Values,
+ SmallVectorImpl<const BasicBlock *> &Blocks) const {
+ if (auto *Phi = dyn_cast<PHINode>(&Instr)) {
+ for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
+ const Value *Incoming = Phi->getIncomingValue(I);
+ const BasicBlock *Block = Phi->getIncomingBlock(I);
+ Values.push_back(!isa<UndefValue>(Incoming) ? Incoming : ValueRefNull);
+ Blocks.push_back(Block);
+ }
+ }
+}
+
template <> Intrinsic::ID SSAContext::getIntrinsicID(const Instruction &I) {
if (auto *CB = dyn_cast<CallBase>(&I))
return CB->getIntrinsicID();
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll
new file mode 100644
index 0000000000000..7e1018bf59d5f
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll
@@ -0,0 +1,114 @@
+; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
+
+define amdgpu_kernel void @no_divergent_exit1(i32 %a, i32 %b, i32 %c) #0 {
+; CHECK-LABEL: for function 'no_divergent_exit1'
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %div.cond = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT: %div.cond =
+ br label %header
+
+header:
+ %loop.b = phi i32 [ %b, %entry ], [ %new.b, %body.1 ], [ %new.b, %body.2 ]
+; CHECK-NOT: DIVERGENT: %loop.b =
+ %loop.c = phi i32 [ %c, %entry ], [ %loop.c, %body.1 ], [ %new.c, %body.2 ]
+; CHECK: DIVERGENT: %loop.c =
+ %exit.val = phi i32 [ %a, %entry ], [ %next.exit.val, %body.1 ], [ %next.exit.val, %body.2 ]
+; CHECK-NOT: DIVERGENT: %exit.val =
+ %exit.cond = icmp slt i32 %exit.val, 42
+; CHECK-NOT: DIVERGENT: %exit.cond =
+ br i1 %exit.cond, label %end, label %body.1
+; CHECK-NOT: DIVERGENT: br i1 %exit.cond,
+
+body.1:
+ %new.b = add i32 %loop.b, 1
+; CHECK-NOT: DIVERGENT: %new.b =
+ %next.exit.val = add i32 %exit.val, 1
+; CHECK-NOT: DIVERGENT: %next.exit.val =
+ br i1 %div.cond, label %body.2, label %header
+; CHECK: DIVERGENT: br i1 %div.cond,
+
+body.2:
+ %new.c = add i32 %loop.c, 1
+; CHECK: DIVERGENT: %new.c =
+ br label %header
+
+end:
+ ret void
+}
+
+define amdgpu_kernel void @no_divergent_exit2(i32 %a, i32 %b, i32 %c) #0 {
+; CHECK-LABEL: for function 'no_divergent_exit2'
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %div.cond = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT: %div.cond =
+ br label %header
+
+header:
+ %loop.b = phi i32 [ %b, %entry ], [ %merge.b, %merge ]
+; CHECK-NOT: DIVERGENT: %loop.b =
+ %loop.c = phi i32 [ %c, %entry ], [ %merge.c, %merge ]
+; CHECK: DIVERGENT: %loop.c =
+ %exit.val = phi i32 [ %a, %entry ], [ %next.exit.val, %merge ]
+; CHECK-NOT: DIVERGENT: %exit.val =
+ %exit.cond = icmp slt i32 %exit.val, 42
+; CHECK-NOT: DIVERGENT: %exit.cond =
+ br i1 %exit.cond, label %end, label %body.1
+; CHECK-NOT: DIVERGENT: br i1 %exit.cond,
+
+body.1:
+ %new.b = add i32 %loop.b, 1
+; CHECK-NOT: DIVERGENT: %new.b =
+ %next.exit.val = add i32 %exit.val, 1
+; CHECK-NOT: DIVERGENT: %next.exit.val =
+ br i1 %div.cond, label %body.2, label %merge
+; CHECK: DIVERGENT: br i1 %div.cond,
+
+body.2:
+ %new.c = add i32 %loop.c, 1
+; CHECK: DIVERGENT: %new.c =
+ br label %merge
+
+merge:
+ %merge.b = phi i32 [ %new.b, %body.1 ], [ %new.b, %body.2 ]
+; CHECK-NOT: DIVERGENT: %merge.b =
+ %merge.c = phi i32 [ %loop.c, %body.1 ], [ %new.c, %body.2 ]
+; CHECK: DIVERGENT: %merge.c =
+ br label %header
+
+end:
+ ret void
+}
+
+define amdgpu_kernel void @no_loop_phi_divergence(i32 %a) #0 {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %uni.cond = icmp slt i32 %a, 0
+; CHECK-NOT: DIVERGENT: %uni.cond =
+ %div.cond = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT: %div.cond =
+ br i1 %uni.cond, label %div.branch.block, label %merge
+; CHECK-NOT: DIVERGENT: br i1 %uni.cond,
+
+div.branch.block:
+ br i1 %div.cond, label %div.block.1, label %div.block.2
+; CHECK: DIVERGENT: br i1 %div.cond,
+
+div.block.1:
+ br label %merge
+
+div.block.2:
+ br label %merge
+
+merge:
+ %uni.val = phi i32 [ 0, %entry ], [ 1, %div.block.1 ], [ 1, %div.block.2 ]
+; CHECK-NOT: DIVERGENT: %uni.val =
+ %div.val = phi i32 [ 0, %entry ], [ 1, %div.block.1 ], [ 2, %div.block.2 ]
+; CHECK: DIVERGENT: %div.val =
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
More information about the llvm-commits
mailing list