[llvm] [Uniformity] Implement per-output machine uniformity analysis (PR #179275)

Pankaj Dwivedi via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 6 04:23:46 PST 2026


https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/179275

>From b05946d3a3709a96dc0ad476f85d5d0369f86ce4 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Mon, 2 Feb 2026 18:18:04 +0530
Subject: [PATCH 1/5] [AMDGPU] Add test for amdgcn.if/else per-output
 uniformity (NFC)

---
 .../AMDGPU/MIR/per-output-uniformity.mir      | 78 +++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir

diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir
new file mode 100644
index 0000000000000..db8cb74bd578c
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir
@@ -0,0 +1,78 @@
+# RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+# RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -passes='print<machine-uniformity>' -filetype=null %s 2>&1 | FileCheck %s
+
+# Test per-output uniformity analysis for amdgcn.if and amdgcn.else intrinsics.
+# These intrinsics produce two outputs:
+#   - First result (i1): Inherits divergence from the input condition.
+#   - Second result (i64): Saved exec mask - always uniform.
+
+# Test amdgcn.if with UNIFORM input
+---
+name:            amdgcn_if_uniform_input
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function:  @amdgcn_if_uniform_input
+    ; Currently both outputs are marked divergent even with uniform input
+    ; CHECK: DIVERGENT: %1
+    ; CHECK: DIVERGENT: %2
+    %0:_(s1) = G_IMPLICIT_DEF
+    %1:_(s1), %2:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %0:_(s1)
+    S_ENDPGM 0
+...
+
+# Test amdgcn.if with DIVERGENT input
+---
+name:            amdgcn_if_divergent_input
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function:  @amdgcn_if_divergent_input
+    ; Both outputs are divergent
+    ; CHECK: DIVERGENT: %3
+    ; CHECK: DIVERGENT: %4
+    %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+    %1:_(s32) = G_CONSTANT i32 16
+    %2:_(s1) = G_ICMP intpred(slt), %0:_(s32), %1:_
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1)
+    S_ENDPGM 0
+...
+
+# Test amdgcn.else with UNIFORM input
+---
+name:            amdgcn_else_uniform_input
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function:  @amdgcn_else_uniform_input
+    ; Currently both outputs are marked divergent even with uniform input
+    ; CHECK: DIVERGENT: %1
+    ; CHECK: DIVERGENT: %2
+    %0:_(s64) = G_IMPLICIT_DEF
+    %1:_(s1), %2:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %0:_(s64)
+    S_ENDPGM 0
+...
+
+# Test amdgcn.else with DIVERGENT input
+---
+name:            amdgcn_else_divergent_input
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function:  @amdgcn_else_divergent_input
+    ; Both outputs are divergent
+    ; CHECK: DIVERGENT: %2
+    ; CHECK: DIVERGENT: %3
+    %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+    %1:_(s64) = G_ZEXT %0:_(s32)
+    %2:_(s1), %3:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %1:_(s64)
+    S_ENDPGM 0
+...

>From c8d779166006fc288aac4d562a10aaf52165fe01 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Mon, 2 Feb 2026 18:24:45 +0530
Subject: [PATCH 2/5] Implement per-output machine uniformity analysis

---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 21 ++++-
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   |  7 +-
 llvm/lib/Analysis/UniformityAnalysis.cpp      |  2 +-
 .../lib/CodeGen/MachineUniformityAnalysis.cpp | 92 ++++++++++++++++---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 38 +++++---
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  8 +-
 .../AMDGPU/MIR/hidden-diverge-gmir.mir        |  6 +-
 .../AMDGPU/MIR/per-output-uniformity.mir      | 28 +++---
 8 files changed, 147 insertions(+), 55 deletions(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 2db76a1ad9b13..55af9be64883e 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -357,8 +357,8 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
 
   const FunctionT &getFunction() const { return F; }
 
-  /// \brief Mark \p UniVal as a value that is always uniform.
-  void addUniformOverride(const InstructionT &Instr);
+  /// \brief Mark \p V as a value that is always uniform.
+  void addUniformOverride(ConstValueRefT V);
 
   /// \brief Examine \p I for divergent outputs and add to the worklist.
   void markDivergent(const InstructionT &I);
@@ -382,6 +382,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   /// operands
   bool isAlwaysUniform(const InstructionT &Instr) const;
 
+  /// \brief Whether \p V is always uniform (per-value check for MIR).
+  bool isAlwaysUniform(ConstValueRefT V) const;
+
   bool hasDivergentDefs(const InstructionT &I) const;
 
   bool isDivergent(const InstructionT &I) const {
@@ -440,7 +443,7 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   SyncDependenceAnalysisT SDA;
 
   // Set of known-uniform values.
-  SmallPtrSet<const InstructionT *, 32> UniformOverrides;
+  DenseSet<ConstValueRefT> UniformOverrides;
 
   /// \brief Mark all nodes in \p JoinBlock as divergent and push them on
   /// the worklist.
@@ -801,6 +804,8 @@ void GenericUniformityAnalysisImpl<ContextT>::markDivergent(
 template <typename ContextT>
 bool GenericUniformityAnalysisImpl<ContextT>::markDivergent(
     ConstValueRefT Val) {
+  if (isAlwaysUniform(Val))
+    return false;
   if (DivergentValues.insert(Val).second) {
     LLVM_DEBUG(dbgs() << "marked divergent: " << Context.print(Val) << "\n");
     return true;
@@ -810,8 +815,8 @@ bool GenericUniformityAnalysisImpl<ContextT>::markDivergent(
 
 template <typename ContextT>
 void GenericUniformityAnalysisImpl<ContextT>::addUniformOverride(
-    const InstructionT &Instr) {
-  UniformOverrides.insert(&Instr);
+    ConstValueRefT V) {
+  UniformOverrides.insert(V);
 }
 
 // Mark as divergent all external uses of values defined in \p DefCycle.
@@ -1143,6 +1148,12 @@ bool GenericUniformityAnalysisImpl<ContextT>::isAlwaysUniform(
   return UniformOverrides.contains(&Instr);
 }
 
+template <typename ContextT>
+bool GenericUniformityAnalysisImpl<ContextT>::isAlwaysUniform(
+    ConstValueRefT V) const {
+  return UniformOverrides.contains(V);
+}
+
 template <typename ContextT>
 GenericUniformityInfo<ContextT>::GenericUniformityInfo(
     const DominatorTreeT &DT, const CycleInfoT &CI,
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 91fddce7e7e47..7587e9074870b 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2367,9 +2367,10 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
     llvm_unreachable("impossible call instruction");
   }
 
-  /// Return the uniformity behavior of the given instruction.
-  virtual InstructionUniformity
-  getInstructionUniformity(const MachineInstr &MI) const {
+  /// Return the uniformity behavior of the given instruction's output.
+  /// \p DefIdx specifies which output to query (for multi-output instructions).
+  virtual InstructionUniformity getDefUniformity(const MachineInstr &MI,
+                                                 unsigned DefIdx = 0) const {
     return InstructionUniformity::Default;
   }
 
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index b56534935d7c2..3f021584ea901 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -34,7 +34,7 @@ template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
     InstructionUniformity IU = TTI->getInstructionUniformity(&I);
     switch (IU) {
     case InstructionUniformity::AlwaysUniform:
-      addUniformOverride(I);
+      addUniformOverride(&I);
       continue;
     case InstructionUniformity::NeverUniform:
       markDivergent(I);
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index dbadb67e1e6d2..6947c12d3eb1e 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -46,24 +46,86 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::markDefsDivergent(
   return insertedDivergent;
 }
 
+template <>
+bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isAlwaysUniform(
+    const MachineInstr &Instr) const {
+  // For MIR, an instruction is "always uniform" only if it has at least one
+  // virtual register def AND all those defs are in UniformOverrides.
+  // Instructions with no virtual register defs (e.g., terminators like
+  // G_BRCOND, G_BR) return false to ensure they can be properly processed
+  // for divergence during propagation.
+  bool HasVirtualDef = false;
+  for (const MachineOperand &Op : Instr.all_defs()) {
+    if (!Op.getReg().isVirtual())
+      continue;
+    HasVirtualDef = true;
+    if (!UniformOverrides.contains(Op.getReg()))
+      return false;
+  }
+  // Only return true if we found at least one virtual def and all were uniform
+  return HasVirtualDef;
+}
+
 template <>
 void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
-  const auto &InstrInfo = *F.getSubtarget().getInstrInfo();
-
-  for (const MachineBasicBlock &block : F) {
-    for (const MachineInstr &instr : block) {
-      auto uniformity = InstrInfo.getInstructionUniformity(instr);
-
-      switch (uniformity) {
-      case InstructionUniformity::AlwaysUniform:
-        addUniformOverride(instr);
-        break;
-      case InstructionUniformity::NeverUniform:
-        markDivergent(instr);
-        break;
-      case InstructionUniformity::Default:
-        break;
+  const TargetInstrInfo &InstrInfo = *F.getSubtarget().getInstrInfo();
+  const MachineRegisterInfo &MRI = F.getRegInfo();
+  const RegisterBankInfo &RBI = *F.getSubtarget().getRegBankInfo();
+  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+
+  for (const MachineBasicBlock &Block : F) {
+    for (const MachineInstr &Instr : Block) {
+      // Terminators are handled separately because:
+      // 1. Many terminators (G_BRCOND, G_BR) have no def operands, so the
+      //    per-def loop below would skip them entirely.
+      // 2. Divergent terminators mark the BLOCK as divergent
+      // (DivergentTermBlocks),
+      //    not individual values, which is different from regular instructions.
+      // For terminators like SI_IF/SI_ELSE, getDefUniformity() checks the
+      // isNeverUniform() flag, so DefIdx is not relevant here.
+      if (Instr.isTerminator()) {
+        InstructionUniformity Uniformity = InstrInfo.getDefUniformity(Instr);
+        if (Uniformity == InstructionUniformity::NeverUniform) {
+          if (DivergentTermBlocks.insert(Instr.getParent()).second) {
+            Worklist.push_back(&Instr);
+          }
+        }
+        continue;
+      }
+
+      // Query uniformity for each def operand separately.
+      unsigned DefIdx = 0;
+      bool HasDivergentDef = false;
+      for (const MachineOperand &Op : Instr.all_defs()) {
+        if (!Op.getReg().isVirtual()) {
+          DefIdx++;
+          continue;
+        }
+
+        InstructionUniformity Uniformity =
+            InstrInfo.getDefUniformity(Instr, DefIdx);
+
+        switch (Uniformity) {
+        case InstructionUniformity::AlwaysUniform:
+          addUniformOverride(Op.getReg());
+          break;
+        case InstructionUniformity::NeverUniform:
+          // Skip registers that are inherently uniform (e.g., SGPRs on AMDGPU)
+          // even if the instruction is marked as NeverUniform.
+          if (!TRI.isUniformReg(MRI, RBI, Op.getReg())) {
+            if (markDivergent(Op.getReg()))
+              HasDivergentDef = true;
+          }
+          break;
+        case InstructionUniformity::Default:
+          break;
+        }
+        DefIdx++;
       }
+      // If any def was marked divergent, add the instruction to worklist
+      // for divergence propagation to users.
+      if (HasDivergentDef)
+        Worklist.push_back(&Instr);
     }
   }
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 09efba485f6f8..0b775d8232603 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10614,7 +10614,8 @@ SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const {
 }
 
 InstructionUniformity
-SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
+SIInstrInfo::getGenericDefUniformity(const MachineInstr &MI,
+                                     unsigned DefIdx) const {
   const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
   unsigned Opcode = MI.getOpcode();
 
@@ -10639,19 +10640,34 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
   if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
     return HandleAddrSpaceCast(MI);
 
-  if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
-    auto IID = GI->getIntrinsicID();
-    if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
-      return InstructionUniformity::NeverUniform;
+  if (const GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI)) {
+    Intrinsic::ID IID = GI->getIntrinsicID();
+    if (AMDGPU::isIntrinsicSourceOfDivergence(IID)) {
+      // Some intrinsics produce multiple outputs with mixed uniformity.
+      // For these, we need to check DefIdx to determine which output is being
+      // queried and return the appropriate uniformity.
+      switch (IID) {
+      case Intrinsic::amdgcn_if:
+      case Intrinsic::amdgcn_else:
+        // These intrinsics produce two outputs:
+        //   DefIdx=0: "Exec mask not zero" flag (i1) - inherits divergence from
+        //             the input condition to ensure proper divergence
+        //             propagation.
+        //   DefIdx=1: Saved exec mask (i64) - always uniform as all active
+        //             lanes see the same mask value.
+        return DefIdx == 1 ? InstructionUniformity::AlwaysUniform
+                           : InstructionUniformity::Default;
+      default:
+        return InstructionUniformity::NeverUniform;
+      }
+    }
     if (AMDGPU::isIntrinsicAlwaysUniform(IID))
       return InstructionUniformity::AlwaysUniform;
 
     switch (IID) {
     case Intrinsic::amdgcn_addrspacecast_nonnull:
       return HandleAddrSpaceCast(MI);
-    case Intrinsic::amdgcn_if:
-    case Intrinsic::amdgcn_else:
-      // FIXME: Uniform if second result
+    default:
       break;
     }
 
@@ -10694,8 +10710,8 @@ const MIRFormatter *SIInstrInfo::getMIRFormatter() const {
   return Formatter.get();
 }
 
-InstructionUniformity
-SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
+InstructionUniformity SIInstrInfo::getDefUniformity(const MachineInstr &MI,
+                                                    unsigned DefIdx) const {
 
   if (isNeverUniform(MI))
     return InstructionUniformity::NeverUniform;
@@ -10719,7 +10735,7 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
 
   // GMIR handling
   if (MI.isPreISelOpcode())
-    return SIInstrInfo::getGenericInstructionUniformity(MI);
+    return SIInstrInfo::getGenericDefUniformity(MI, DefIdx);
 
   // Atomics are divergent because they are executed sequentially: when an
   // atomic operation refers to the same address in each thread, then each
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 05cf804d08ffc..5ab60267fd9ab 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1667,11 +1667,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 
   const MachineOperand &getCalleeOperand(const MachineInstr &MI) const override;
 
-  InstructionUniformity
-  getInstructionUniformity(const MachineInstr &MI) const final;
+  InstructionUniformity getDefUniformity(const MachineInstr &MI,
+                                         unsigned DefIdx = 0) const final;
 
-  InstructionUniformity
-  getGenericInstructionUniformity(const MachineInstr &MI) const;
+  InstructionUniformity getGenericDefUniformity(const MachineInstr &MI,
+                                                unsigned DefIdx = 0) const;
 
   const MIRFormatter *getMIRFormatter() const override;
 
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir
index 27c53815feb06..e0d23d4af85dd 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir
@@ -5,7 +5,8 @@
 # CHECK:     DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
 # CHECK:     DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt)
 # CHECK:     DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_
-# CHECK:     DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
+# The first output (i1) of amdgcn.if inherits divergence from input.
+# The second output (exec mask, i64) is always uniform.
 # CHECK:     DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
 # CHECK:     DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1
 # CHECK:     DIVERGENT: G_BR %bb.2
@@ -13,7 +14,8 @@
 # CHECK-LABEL: BLOCK bb.2
 # CHECK-NOT: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.1, %{{[0-9]*}}:_(s32), %bb.0
 # CHECK:     DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_PHI %{{[0-9]*}}:_(s1), %bb.1, %{{[0-9]*}}:_(s1), %bb.0
-# CHECK:     DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
+# The first output (i1) of amdgcn.if inherits divergence from input.
+# The second output (exec mask, i64) is always uniform.
 # CHECK:     DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
 # CHECK:     DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.3
 # CHECK:     DIVERGENT: G_BR %bb.4
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir
index db8cb74bd578c..45cc2058d00d4 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir
@@ -6,7 +6,7 @@
 #   - First result (i1): Inherits divergence from the input condition.
 #   - Second result (i64): Saved exec mask - always uniform.
 
-# Test amdgcn.if with UNIFORM input
+# Test amdgcn.if with UNIFORM input - both outputs should be uniform
 ---
 name:            amdgcn_if_uniform_input
 tracksRegLiveness: true
@@ -15,15 +15,14 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; CHECK-LABEL: MachineUniformityInfo for function:  @amdgcn_if_uniform_input
-    ; Currently both outputs are marked divergent even with uniform input
-    ; CHECK: DIVERGENT: %1
-    ; CHECK: DIVERGENT: %2
+    ; With uniform input, both outputs are uniform
+    ; CHECK: ALL VALUES UNIFORM
     %0:_(s1) = G_IMPLICIT_DEF
     %1:_(s1), %2:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %0:_(s1)
     S_ENDPGM 0
 ...
 
-# Test amdgcn.if with DIVERGENT input
+# Test amdgcn.if with DIVERGENT input - first output divergent, second uniform
 ---
 name:            amdgcn_if_divergent_input
 tracksRegLiveness: true
@@ -32,9 +31,10 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; CHECK-LABEL: MachineUniformityInfo for function:  @amdgcn_if_divergent_input
-    ; Both outputs are divergent
+    ; First output (%3, i1) inherits divergence from the divergent input
     ; CHECK: DIVERGENT: %3
-    ; CHECK: DIVERGENT: %4
+    ; Second output (%4, exec mask) is always uniform regardless of input
+    ; CHECK-NOT: DIVERGENT: %4
     %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
     %1:_(s32) = G_CONSTANT i32 16
     %2:_(s1) = G_ICMP intpred(slt), %0:_(s32), %1:_
@@ -42,7 +42,7 @@ body:             |
     S_ENDPGM 0
 ...
 
-# Test amdgcn.else with UNIFORM input
+# Test amdgcn.else with UNIFORM input - both outputs should be uniform
 ---
 name:            amdgcn_else_uniform_input
 tracksRegLiveness: true
@@ -51,15 +51,14 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; CHECK-LABEL: MachineUniformityInfo for function:  @amdgcn_else_uniform_input
-    ; Currently both outputs are marked divergent even with uniform input
-    ; CHECK: DIVERGENT: %1
-    ; CHECK: DIVERGENT: %2
+    ; With uniform input, both outputs are uniform
+    ; CHECK: ALL VALUES UNIFORM
     %0:_(s64) = G_IMPLICIT_DEF
     %1:_(s1), %2:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %0:_(s64)
     S_ENDPGM 0
 ...
 
-# Test amdgcn.else with DIVERGENT input
+# Test amdgcn.else with DIVERGENT input - first output divergent, second uniform
 ---
 name:            amdgcn_else_divergent_input
 tracksRegLiveness: true
@@ -68,9 +67,10 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; CHECK-LABEL: MachineUniformityInfo for function:  @amdgcn_else_divergent_input
-    ; Both outputs are divergent
+    ; First output (%2, i1) inherits divergence from the divergent input
     ; CHECK: DIVERGENT: %2
-    ; CHECK: DIVERGENT: %3
+    ; Second output (%3, exec mask) is always uniform regardless of input
+    ; CHECK-NOT: DIVERGENT: %3
     %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
     %1:_(s64) = G_ZEXT %0:_(s32)
     %2:_(s1), %3:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %1:_(s64)

>From 111bef25592df954fe02886a96b79b68a3fc0321 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Wed, 4 Feb 2026 15:10:22 +0530
Subject: [PATCH 3/5] update uniformity per value instead of per instruction

---
 llvm/include/llvm/ADT/GenericUniformityImpl.h |   2 +-
 llvm/include/llvm/ADT/Uniformity.h            |  12 +-
 .../llvm/Analysis/TargetTransformInfo.h       |   8 +-
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   4 +-
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   |   8 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp     |   8 +-
 llvm/lib/Analysis/UniformityAnalysis.cpp      |  13 +-
 .../lib/CodeGen/MachineUniformityAnalysis.cpp |  26 ++--
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |   9 +-
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h |   2 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  67 +++++-----
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |   8 +-
 .../Target/NVPTX/NVPTXTargetTransformInfo.cpp |   7 +-
 .../Target/NVPTX/NVPTXTargetTransformInfo.h   |   2 +-
 .../AMDGPU/MIR/per-output-uniformity.mir      | 114 +++++++++++++++++-
 15 files changed, 197 insertions(+), 93 deletions(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 55af9be64883e..b497eaff7b264 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -443,7 +443,7 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   SyncDependenceAnalysisT SDA;
 
   // Set of known-uniform values.
-  DenseSet<ConstValueRefT> UniformOverrides;
+  SmallDenseSet<ConstValueRefT, 8> UniformOverrides;
 
   /// \brief Mark all nodes in \p JoinBlock as divergent and push them on
   /// the worklist.
diff --git a/llvm/include/llvm/ADT/Uniformity.h b/llvm/include/llvm/ADT/Uniformity.h
index 21ca106b80be3..1e7104ec9b4b4 100644
--- a/llvm/include/llvm/ADT/Uniformity.h
+++ b/llvm/include/llvm/ADT/Uniformity.h
@@ -11,18 +11,18 @@
 
 namespace llvm {
 
-/// Enum describing how instructions behave with respect to uniformity and
+/// Enum describing how values behave with respect to uniformity and
 /// divergence, to answer the question: if the same instruction is executed by
-/// two threads in a convergent set of threads, will its result value(s) be
+/// two threads in a convergent set of threads, will its result value be
 /// uniform, i.e. the same on both threads?
-enum class InstructionUniformity {
-  /// The result values are uniform if and only if all operands are uniform.
+enum class ValueUniformity {
+  /// The result value is uniform if and only if all operands are uniform.
   Default,
 
-  /// The result values are always uniform.
+  /// The result value is always uniform.
   AlwaysUniform,
 
-  /// The result values can never be assumed to be uniform.
+  /// The result value can never be assumed to be uniform.
   NeverUniform
 };
 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index a013122df5f06..e4fc27fdd8a41 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -517,14 +517,14 @@ class TargetTransformInfo {
   /// uniformity analysis and assume all values are uniform.
   LLVM_ABI bool hasBranchDivergence(const Function *F = nullptr) const;
 
-  /// Get target-specific uniformity information for an instruction.
+  /// Get target-specific uniformity information for a value.
   /// This allows targets to provide more fine-grained control over
-  /// uniformity analysis by specifying whether specific instructions
+  /// uniformity analysis by specifying whether specific values
   /// should always or never be considered uniform, or require custom
   /// operand-based analysis.
   /// \param V The value to query for uniformity information.
-  /// \return InstructionUniformity.
-  LLVM_ABI InstructionUniformity getInstructionUniformity(const Value *V) const;
+  /// \return ValueUniformity.
+  LLVM_ABI ValueUniformity getValueUniformity(const Value *V) const;
 
   /// Query the target whether the specified address space cast from FromAS to
   /// ToAS is valid.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 6d27cabf404f8..311cfe9d454b1 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -132,8 +132,8 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
-  virtual InstructionUniformity getInstructionUniformity(const Value *V) const {
-    return InstructionUniformity::Default;
+  virtual ValueUniformity getValueUniformity(const Value *V) const {
+    return ValueUniformity::Default;
   }
 
   virtual bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 7587e9074870b..bb232ba4b4dec 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2367,11 +2367,11 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
     llvm_unreachable("impossible call instruction");
   }
 
-  /// Return the uniformity behavior of the given instruction's output.
+  /// Return the uniformity behavior of the given value (def operand).
   /// \p DefIdx specifies which output to query (for multi-output instructions).
-  virtual InstructionUniformity getDefUniformity(const MachineInstr &MI,
-                                                 unsigned DefIdx = 0) const {
-    return InstructionUniformity::Default;
+  virtual ValueUniformity getValueUniformity(const MachineInstr &MI,
+                                             unsigned DefIdx = 0) const {
+    return ValueUniformity::Default;
   }
 
   /// Returns true if the given \p MI defines a TargetIndex operand that can be
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 19785204ed2b3..def1bd37da041 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -294,14 +294,14 @@ bool TargetTransformInfo::hasBranchDivergence(const Function *F) const {
   return TTIImpl->hasBranchDivergence(F);
 }
 
-InstructionUniformity
-llvm::TargetTransformInfo::getInstructionUniformity(const Value *V) const {
+ValueUniformity
+llvm::TargetTransformInfo::getValueUniformity(const Value *V) const {
   // Calls with the NoDivergenceSource attribute are always uniform.
   if (const auto *Call = dyn_cast<CallBase>(V)) {
     if (Call->hasFnAttr(Attribute::NoDivergenceSource))
-      return InstructionUniformity::AlwaysUniform;
+      return ValueUniformity::AlwaysUniform;
   }
-  return TTIImpl->getInstructionUniformity(V);
+  return TTIImpl->getValueUniformity(V);
 }
 
 bool llvm::TargetTransformInfo::isValidAddrSpaceCast(unsigned FromAS,
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 3f021584ea901..55b3ca5f8c098 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -31,21 +31,20 @@ bool llvm::GenericUniformityAnalysisImpl<SSAContext>::markDefsDivergent(
 
 template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
   for (auto &I : instructions(F)) {
-    InstructionUniformity IU = TTI->getInstructionUniformity(&I);
-    switch (IU) {
-    case InstructionUniformity::AlwaysUniform:
+    ValueUniformity VU = TTI->getValueUniformity(&I);
+    switch (VU) {
+    case ValueUniformity::AlwaysUniform:
       addUniformOverride(&I);
       continue;
-    case InstructionUniformity::NeverUniform:
+    case ValueUniformity::NeverUniform:
       markDivergent(I);
       continue;
-    case InstructionUniformity::Default:
+    case ValueUniformity::Default:
       break;
     }
   }
   for (auto &Arg : F.args()) {
-    if (TTI->getInstructionUniformity(&Arg) ==
-        InstructionUniformity::NeverUniform)
+    if (TTI->getValueUniformity(&Arg) == ValueUniformity::NeverUniform)
       markDivergent(&Arg);
   }
 }
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 6947c12d3eb1e..55eb19ea5f8e8 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -77,15 +77,15 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
     for (const MachineInstr &Instr : Block) {
       // Terminators are handled separately because:
       // 1. Many terminators (G_BRCOND, G_BR) have no def operands, so the
-      //    per-def loop below would skip them entirely.
-      // 2. Divergent terminators mark the BLOCK as divergent
-      // (DivergentTermBlocks),
-      //    not individual values, which is different from regular instructions.
-      // For terminators like SI_IF/SI_ELSE, getDefUniformity() checks the
-      // isNeverUniform() flag, so DefIdx is not relevant here.
+      // per-def loop below would skip them entirely.
+      // 2. Divergent terminators mark the BLOCK as
+      // divergent(DivergentTermBlocks), not individual values, which is
+      // different from regular instructions. For terminators like
+      // SI_IF/SI_ELSE, getValueUniformity() checks the isNeverUniform() flag,
+      // so DefIdx is not relevant here.
       if (Instr.isTerminator()) {
-        InstructionUniformity Uniformity = InstrInfo.getDefUniformity(Instr);
-        if (Uniformity == InstructionUniformity::NeverUniform) {
+        ValueUniformity Uniformity = InstrInfo.getValueUniformity(Instr);
+        if (Uniformity == ValueUniformity::NeverUniform) {
           if (DivergentTermBlocks.insert(Instr.getParent()).second) {
             Worklist.push_back(&Instr);
           }
@@ -102,14 +102,14 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
           continue;
         }
 
-        InstructionUniformity Uniformity =
-            InstrInfo.getDefUniformity(Instr, DefIdx);
+        ValueUniformity Uniformity =
+            InstrInfo.getValueUniformity(Instr, DefIdx);
 
         switch (Uniformity) {
-        case InstructionUniformity::AlwaysUniform:
+        case ValueUniformity::AlwaysUniform:
           addUniformOverride(Op.getReg());
           break;
-        case InstructionUniformity::NeverUniform:
+        case ValueUniformity::NeverUniform:
           // Skip registers that are inherently uniform (e.g., SGPRs on AMDGPU)
           // even if the instruction is marked as NeverUniform.
           if (!TRI.isUniformReg(MRI, RBI, Op.getReg())) {
@@ -117,7 +117,7 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
               HasDivergentDef = true;
           }
           break;
-        case InstructionUniformity::Default:
+        case ValueUniformity::Default:
           break;
         }
         DefIdx++;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 3e02587f61336..9199001c02a05 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1620,13 +1620,12 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
   return BaseT::getNumberOfParts(Tp);
 }
 
-InstructionUniformity
-GCNTTIImpl::getInstructionUniformity(const Value *V) const {
+ValueUniformity GCNTTIImpl::getValueUniformity(const Value *V) const {
   if (isAlwaysUniform(V))
-    return InstructionUniformity::AlwaysUniform;
+    return ValueUniformity::AlwaysUniform;
 
   if (isSourceOfDivergence(V))
-    return InstructionUniformity::NeverUniform;
+    return ValueUniformity::NeverUniform;
 
-  return InstructionUniformity::Default;
+  return ValueUniformity::Default;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 3ec157aacd0aa..dd1a11ad6fe04 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -310,7 +310,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   /// implementation.
   unsigned getNumberOfParts(Type *Tp) const override;
 
-  InstructionUniformity getInstructionUniformity(const Value *V) const override;
+  ValueUniformity getValueUniformity(const Value *V) const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0b775d8232603..9c9139a1de0c6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10613,9 +10613,8 @@ SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const {
   return TargetInstrInfo::getCalleeOperand(MI);
 }
 
-InstructionUniformity
-SIInstrInfo::getGenericDefUniformity(const MachineInstr &MI,
-                                     unsigned DefIdx) const {
+ValueUniformity SIInstrInfo::getGenericValueUniformity(const MachineInstr &MI,
+                                                       unsigned DefIdx) const {
   const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
   unsigned Opcode = MI.getOpcode();
 
@@ -10630,8 +10629,8 @@ SIInstrInfo::getGenericDefUniformity(const MachineInstr &MI,
     return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
                    DstAS == AMDGPUAS::FLAT_ADDRESS &&
                    ST.hasGloballyAddressableScratch()
-               ? InstructionUniformity::NeverUniform
-               : InstructionUniformity::Default;
+               ? ValueUniformity::NeverUniform
+               : ValueUniformity::Default;
   };
 
   // If the target supports globally addressable scratch, the mapping from
@@ -10650,19 +10649,21 @@ SIInstrInfo::getGenericDefUniformity(const MachineInstr &MI,
       case Intrinsic::amdgcn_if:
       case Intrinsic::amdgcn_else:
         // These intrinsics produce two outputs:
-        //   DefIdx=0: "Exec mask not zero" flag (i1) - inherits divergence from
-        //             the input condition to ensure proper divergence
-        //             propagation.
+        //   DefIdx=0: Boolean (i1) indicating whether the "then" block should
+        //             execute. After the exec mask is updated (ANDed with the
+        //             condition), this is true if exec is non-zero (at least
+        //             one lane active), false if exec is zero (no lanes
+        //             active). Inherits divergence from the input condition.
         //   DefIdx=1: Saved exec mask (i64) - always uniform as all active
-        //             lanes see the same mask value.
-        return DefIdx == 1 ? InstructionUniformity::AlwaysUniform
-                           : InstructionUniformity::Default;
+        //             lanes observe the same mask value.
+        return DefIdx == 1 ? ValueUniformity::AlwaysUniform
+                           : ValueUniformity::Default;
       default:
-        return InstructionUniformity::NeverUniform;
+        return ValueUniformity::NeverUniform;
       }
     }
     if (AMDGPU::isIntrinsicAlwaysUniform(IID))
-      return InstructionUniformity::AlwaysUniform;
+      return ValueUniformity::AlwaysUniform;
 
     switch (IID) {
     case Intrinsic::amdgcn_addrspacecast_nonnull:
@@ -10671,7 +10672,7 @@ SIInstrInfo::getGenericDefUniformity(const MachineInstr &MI,
       break;
     }
 
-    return InstructionUniformity::Default;
+    return ValueUniformity::Default;
   }
 
   // Loads from the private and flat address spaces are divergent, because
@@ -10683,25 +10684,25 @@ SIInstrInfo::getGenericDefUniformity(const MachineInstr &MI,
   if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
       Opcode == AMDGPU::G_SEXTLOAD) {
     if (MI.memoperands_empty())
-      return InstructionUniformity::NeverUniform; // conservative assumption
+      return ValueUniformity::NeverUniform; // conservative assumption
 
     if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
           return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
                  mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
         })) {
       // At least one MMO in a non-global address space.
-      return InstructionUniformity::NeverUniform;
+      return ValueUniformity::NeverUniform;
     }
-    return InstructionUniformity::Default;
+    return ValueUniformity::Default;
   }
 
   if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
       Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
       Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
       AMDGPU::isGenericAtomic(Opcode)) {
-    return InstructionUniformity::NeverUniform;
+    return ValueUniformity::NeverUniform;
   }
-  return InstructionUniformity::Default;
+  return ValueUniformity::Default;
 }
 
 const MIRFormatter *SIInstrInfo::getMIRFormatter() const {
@@ -10710,32 +10711,32 @@ const MIRFormatter *SIInstrInfo::getMIRFormatter() const {
   return Formatter.get();
 }
 
-InstructionUniformity SIInstrInfo::getDefUniformity(const MachineInstr &MI,
-                                                    unsigned DefIdx) const {
+ValueUniformity SIInstrInfo::getValueUniformity(const MachineInstr &MI,
+                                                unsigned DefIdx) const {
 
   if (isNeverUniform(MI))
-    return InstructionUniformity::NeverUniform;
+    return ValueUniformity::NeverUniform;
 
   unsigned opcode = MI.getOpcode();
   if (opcode == AMDGPU::V_READLANE_B32 ||
       opcode == AMDGPU::V_READFIRSTLANE_B32 ||
       opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
-    return InstructionUniformity::AlwaysUniform;
+    return ValueUniformity::AlwaysUniform;
 
   if (isCopyInstr(MI)) {
     const MachineOperand &srcOp = MI.getOperand(1);
     if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
       const TargetRegisterClass *regClass =
           RI.getPhysRegBaseClass(srcOp.getReg());
-      return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
-                                      : InstructionUniformity::NeverUniform;
+      return RI.isSGPRClass(regClass) ? ValueUniformity::AlwaysUniform
+                                      : ValueUniformity::NeverUniform;
     }
-    return InstructionUniformity::Default;
+    return ValueUniformity::Default;
   }
 
   // GMIR handling
   if (MI.isPreISelOpcode())
-    return SIInstrInfo::getGenericDefUniformity(MI, DefIdx);
+    return SIInstrInfo::getGenericValueUniformity(MI, DefIdx);
 
   // Atomics are divergent because they are executed sequentially: when an
   // atomic operation refers to the same address in each thread, then each
@@ -10743,24 +10744,24 @@ InstructionUniformity SIInstrInfo::getDefUniformity(const MachineInstr &MI,
   // original value.
 
   if (isAtomic(MI))
-    return InstructionUniformity::NeverUniform;
+    return ValueUniformity::NeverUniform;
 
   // Loads from the private and flat address spaces are divergent, because
   // threads can execute the load instruction with the same inputs and get
   // different results.
   if (isFLAT(MI) && MI.mayLoad()) {
     if (MI.memoperands_empty())
-      return InstructionUniformity::NeverUniform; // conservative assumption
+      return ValueUniformity::NeverUniform; // conservative assumption
 
     if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
           return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
                  mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
         })) {
       // At least one MMO in a non-global address space.
-      return InstructionUniformity::NeverUniform;
+      return ValueUniformity::NeverUniform;
     }
 
-    return InstructionUniformity::Default;
+    return ValueUniformity::Default;
   }
 
   const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
@@ -10782,7 +10783,7 @@ InstructionUniformity SIInstrInfo::getDefUniformity(const MachineInstr &MI,
     // register, which are all scalars.
     const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
     if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
-      return InstructionUniformity::NeverUniform;
+      return ValueUniformity::NeverUniform;
   }
 
   // TODO: Uniformity check condtions above can be rearranged for more
@@ -10792,7 +10793,7 @@ InstructionUniformity SIInstrInfo::getDefUniformity(const MachineInstr &MI,
   //       currently turned into no-op COPYs by SelectionDAG ISel and are
   //       therefore no longer recognizable.
 
-  return InstructionUniformity::Default;
+  return ValueUniformity::Default;
 }
 
 unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 5ab60267fd9ab..4366d6f55d767 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1667,11 +1667,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 
   const MachineOperand &getCalleeOperand(const MachineInstr &MI) const override;
 
-  InstructionUniformity getDefUniformity(const MachineInstr &MI,
-                                         unsigned DefIdx = 0) const final;
+  ValueUniformity getValueUniformity(const MachineInstr &MI,
+                                     unsigned DefIdx = 0) const final;
 
-  InstructionUniformity getGenericDefUniformity(const MachineInstr &MI,
-                                                unsigned DefIdx = 0) const;
+  ValueUniformity getGenericValueUniformity(const MachineInstr &MI,
+                                            unsigned DefIdx = 0) const;
 
   const MIRFormatter *getMIRFormatter() const override;
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index c1fe9300785a3..01948d8e3264a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -671,10 +671,9 @@ void NVPTXTTIImpl::collectKernelLaunchBounds(
     LB.push_back({"maxntidz", MaxNTID[2]});
 }
 
-InstructionUniformity
-NVPTXTTIImpl::getInstructionUniformity(const Value *V) const {
+ValueUniformity NVPTXTTIImpl::getValueUniformity(const Value *V) const {
   if (isSourceOfDivergence(V))
-    return InstructionUniformity::NeverUniform;
+    return ValueUniformity::NeverUniform;
 
-  return InstructionUniformity::Default;
+  return ValueUniformity::Default;
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 40eb161bc8666..dca2a0893f2fa 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -207,7 +207,7 @@ class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {
     return false;
   }
 
-  InstructionUniformity getInstructionUniformity(const Value *V) const override;
+  ValueUniformity getValueUniformity(const Value *V) const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir
index 45cc2058d00d4..53bae05a2bb09 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir
@@ -1,10 +1,19 @@
 # RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
 # RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -passes='print<machine-uniformity>' -filetype=null %s 2>&1 | FileCheck %s
 
-# Test per-output uniformity analysis for amdgcn.if and amdgcn.else intrinsics.
-# These intrinsics produce two outputs:
-#   - First result (i1): Inherits divergence from the input condition.
-#   - Second result (i64): Saved exec mask - always uniform.
+# Test per-output (per-value) uniformity analysis for instructions with multiple
+# definitions where each definition can have different uniformity characteristics.
+#
+# The amdgcn.if and amdgcn.else intrinsics are the primary AMDGPU instructions
+# that produce multiple outputs with DIFFERENT uniformity:
+#   - First result (i1): Boolean flag indicating if any lanes are active.
+#                        Inherits divergence from the input condition.
+#   - Second result (i64): Saved exec mask - always uniform as all active lanes
+#                          see the same mask value.
+#
+# Most other multi-output instructions (G_SDIVREM, G_UADDO, G_ATOMIC_CMPXCHG_WITH_SUCCESS,
+# etc.) have all outputs with the SAME uniformity, so they don't require special
+# per-output handling.
 
 # Test amdgcn.if with UNIFORM input - both outputs should be uniform
 ---
@@ -76,3 +85,100 @@ body:             |
     %2:_(s1), %3:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %1:_(s64)
     S_ENDPGM 0
 ...
+
+---
+name:            chained_if_else
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function:  @chained_if_else
+    ; Divergent condition from workitem.id.x
+    ; CHECK: DIVERGENT: %0
+    ; CHECK: DIVERGENT: %2
+    ; CHECK: DIVERGENT: %3
+    ; CHECK-NOT: DIVERGENT: %4
+    ; CHECK-NOT: DIVERGENT: %5
+    ; CHECK-NOT: DIVERGENT: %6
+    %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+    %1:_(s32) = G_CONSTANT i32 16
+    %2:_(s1) = G_ICMP intpred(slt), %0:_(s32), %1:_
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1)
+    ; The exec mask from if (%4) is always uniform, so else sees uniform input
+    %5:_(s1), %6:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %4:_(s64)
+    S_ENDPGM 0
+...
+
+---
+name:            uniform_exec_mask_usage
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function:  @uniform_exec_mask_usage
+    ; CHECK: DIVERGENT: %0
+    ; CHECK: DIVERGENT: %2
+    ; CHECK: DIVERGENT: %3
+    ; CHECK-NOT: DIVERGENT: %4
+    ; CHECK-NOT: DIVERGENT: %5
+    ; CHECK-NOT: DIVERGENT: %6
+    %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+    %1:_(s32) = G_CONSTANT i32 16
+    %2:_(s1) = G_ICMP intpred(slt), %0:_(s32), %1:_
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1)
+    ; The exec mask is uniform, so operations on it are uniform
+    %5:_(s64) = G_CONSTANT i64 0
+    %6:_(s1) = G_ICMP intpred(eq), %4:_(s64), %5:_
+    S_ENDPGM 0
+...
+
+---
+name:            divergent_flag_propagation
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function:  @divergent_flag_propagation
+    ; Divergent input
+    ; CHECK: DIVERGENT: %0
+    ; CHECK: DIVERGENT: %2
+    ; CHECK: DIVERGENT: %3
+    ; CHECK-NOT: DIVERGENT: %4
+    ; CHECK: DIVERGENT: %5
+    ; CHECK: DIVERGENT: %6
+    %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+    %1:_(s32) = G_CONSTANT i32 16
+    %2:_(s1) = G_ICMP intpred(slt), %0:_(s32), %1:_
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1)
+    ; The divergent flag (%3) propagates divergence
+    %5:_(s1) = G_XOR %3:_, %3:_
+    %6:_(s32) = G_ZEXT %3:_(s1)
+    S_ENDPGM 0
+...
+
+---
+name:            nested_if_divergent
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function:  @nested_if_divergent
+    ; First amdgcn.if with divergent input
+    ; CHECK: DIVERGENT: %0
+    ; CHECK: DIVERGENT: %2
+    ; CHECK: DIVERGENT: %3
+    ; CHECK-NOT: DIVERGENT: %4
+    ; CHECK: DIVERGENT: %5
+    ; CHECK-NOT: DIVERGENT: %6
+    %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+    %1:_(s32) = G_CONSTANT i32 16
+    %2:_(s1) = G_ICMP intpred(slt), %0:_(s32), %1:_
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1)
+    ; Nested if using the divergent output from first if
+    %5:_(s1), %6:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %3:_(s1)
+    S_ENDPGM 0
+...

>From bd8ae2c90e7abb4e9f3cda7b364f7df4a813a216 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Fri, 6 Feb 2026 13:47:47 +0530
Subject: [PATCH 4/5] review: add assert for def idx and reg

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9c9139a1de0c6..ed54971e78d89 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10656,6 +10656,10 @@ ValueUniformity SIInstrInfo::getGenericValueUniformity(const MachineInstr &MI,
         //             active). Inherits divergence from the input condition.
         //   DefIdx=1: Saved exec mask (i64) - always uniform as all active
         //             lanes observe the same mask value.
+        assert(DefIdx < 2 && "amdgcn_if/amdgcn_else have exactly 2 defs");
+        assert(MI.getOperand(DefIdx).isReg() &&
+               MI.getOperand(DefIdx).getReg().isVirtual() &&
+               "Expected virtual register def");
         return DefIdx == 1 ? ValueUniformity::AlwaysUniform
                            : ValueUniformity::Default;
       default:

>From 1ab00fd9b51b01fc82c60b9e05c638d3a210cefe Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Fri, 6 Feb 2026 17:52:22 +0530
Subject: [PATCH 5/5] review: remove SI_IF/SI_ELSE handling from
 IntrinsicLaneMaskAnalyzer

---
 llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index f36935d8c0e8f..6650a0fa02a21 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -95,11 +95,6 @@ void IntrinsicLaneMaskAnalyzer::initLaneMaskIntrinsics(MachineFunction &MF) {
         S32S64LaneMask.insert(MI.getOperand(3).getReg());
         S32S64LaneMask.insert(MI.getOperand(0).getReg());
       }
-
-      if (MI.getOpcode() == AMDGPU::SI_IF ||
-          MI.getOpcode() == AMDGPU::SI_ELSE) {
-        S32S64LaneMask.insert(MI.getOperand(0).getReg());
-      }
     }
   }
 }



More information about the llvm-commits mailing list