[llvm] [Uniformity] Implement per-output machine uniformity analysis (PR #179275)
Pankaj Dwivedi via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 6 04:23:46 PST 2026
https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/179275
>From b05946d3a3709a96dc0ad476f85d5d0369f86ce4 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Mon, 2 Feb 2026 18:18:04 +0530
Subject: [PATCH 1/5] [AMDGPU] Add test for amdgcn.if/else per-output
uniformity (NFC)
---
.../AMDGPU/MIR/per-output-uniformity.mir | 78 +++++++++++++++++++
1 file changed, 78 insertions(+)
create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir
new file mode 100644
index 0000000000000..db8cb74bd578c
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir
@@ -0,0 +1,78 @@
+# RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+# RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -passes='print<machine-uniformity>' -filetype=null %s 2>&1 | FileCheck %s
+
+# Test per-output uniformity analysis for amdgcn.if and amdgcn.else intrinsics.
+# These intrinsics produce two outputs:
+# - First result (i1): Inherits divergence from the input condition.
+# - Second result (i64): Saved exec mask - always uniform.
+
+# Test amdgcn.if with UNIFORM input
+---
+name: amdgcn_if_uniform_input
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: MachineUniformityInfo for function: @amdgcn_if_uniform_input
+ ; Currently both outputs are marked divergent even with uniform input
+ ; CHECK: DIVERGENT: %1
+ ; CHECK: DIVERGENT: %2
+ %0:_(s1) = G_IMPLICIT_DEF
+ %1:_(s1), %2:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %0:_(s1)
+ S_ENDPGM 0
+...
+
+# Test amdgcn.if with DIVERGENT input
+---
+name: amdgcn_if_divergent_input
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: MachineUniformityInfo for function: @amdgcn_if_divergent_input
+ ; Both outputs are divergent
+ ; CHECK: DIVERGENT: %3
+ ; CHECK: DIVERGENT: %4
+ %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+ %1:_(s32) = G_CONSTANT i32 16
+ %2:_(s1) = G_ICMP intpred(slt), %0:_(s32), %1:_
+ %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1)
+ S_ENDPGM 0
+...
+
+# Test amdgcn.else with UNIFORM input
+---
+name: amdgcn_else_uniform_input
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: MachineUniformityInfo for function: @amdgcn_else_uniform_input
+ ; Currently both outputs are marked divergent even with uniform input
+ ; CHECK: DIVERGENT: %1
+ ; CHECK: DIVERGENT: %2
+ %0:_(s64) = G_IMPLICIT_DEF
+ %1:_(s1), %2:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %0:_(s64)
+ S_ENDPGM 0
+...
+
+# Test amdgcn.else with DIVERGENT input
+---
+name: amdgcn_else_divergent_input
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: MachineUniformityInfo for function: @amdgcn_else_divergent_input
+ ; Both outputs are divergent
+ ; CHECK: DIVERGENT: %2
+ ; CHECK: DIVERGENT: %3
+ %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+ %1:_(s64) = G_ZEXT %0:_(s32)
+ %2:_(s1), %3:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %1:_(s64)
+ S_ENDPGM 0
+...
>From c8d779166006fc288aac4d562a10aaf52165fe01 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Mon, 2 Feb 2026 18:24:45 +0530
Subject: [PATCH 2/5] Implement per-output machine uniformity analysis
---
llvm/include/llvm/ADT/GenericUniformityImpl.h | 21 ++++-
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 7 +-
llvm/lib/Analysis/UniformityAnalysis.cpp | 2 +-
.../lib/CodeGen/MachineUniformityAnalysis.cpp | 92 ++++++++++++++++---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 38 +++++---
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 8 +-
.../AMDGPU/MIR/hidden-diverge-gmir.mir | 6 +-
.../AMDGPU/MIR/per-output-uniformity.mir | 28 +++---
8 files changed, 147 insertions(+), 55 deletions(-)
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 2db76a1ad9b13..55af9be64883e 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -357,8 +357,8 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
const FunctionT &getFunction() const { return F; }
- /// \brief Mark \p UniVal as a value that is always uniform.
- void addUniformOverride(const InstructionT &Instr);
+ /// \brief Mark \p V as a value that is always uniform.
+ void addUniformOverride(ConstValueRefT V);
/// \brief Examine \p I for divergent outputs and add to the worklist.
void markDivergent(const InstructionT &I);
@@ -382,6 +382,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
/// operands
bool isAlwaysUniform(const InstructionT &Instr) const;
+ /// \brief Whether \p V is always uniform (per-value check for MIR).
+ bool isAlwaysUniform(ConstValueRefT V) const;
+
bool hasDivergentDefs(const InstructionT &I) const;
bool isDivergent(const InstructionT &I) const {
@@ -440,7 +443,7 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
SyncDependenceAnalysisT SDA;
// Set of known-uniform values.
- SmallPtrSet<const InstructionT *, 32> UniformOverrides;
+ DenseSet<ConstValueRefT> UniformOverrides;
/// \brief Mark all nodes in \p JoinBlock as divergent and push them on
/// the worklist.
@@ -801,6 +804,8 @@ void GenericUniformityAnalysisImpl<ContextT>::markDivergent(
template <typename ContextT>
bool GenericUniformityAnalysisImpl<ContextT>::markDivergent(
ConstValueRefT Val) {
+ if (isAlwaysUniform(Val))
+ return false;
if (DivergentValues.insert(Val).second) {
LLVM_DEBUG(dbgs() << "marked divergent: " << Context.print(Val) << "\n");
return true;
@@ -810,8 +815,8 @@ bool GenericUniformityAnalysisImpl<ContextT>::markDivergent(
template <typename ContextT>
void GenericUniformityAnalysisImpl<ContextT>::addUniformOverride(
- const InstructionT &Instr) {
- UniformOverrides.insert(&Instr);
+ ConstValueRefT V) {
+ UniformOverrides.insert(V);
}
// Mark as divergent all external uses of values defined in \p DefCycle.
@@ -1143,6 +1148,12 @@ bool GenericUniformityAnalysisImpl<ContextT>::isAlwaysUniform(
return UniformOverrides.contains(&Instr);
}
+template <typename ContextT>
+bool GenericUniformityAnalysisImpl<ContextT>::isAlwaysUniform(
+ ConstValueRefT V) const {
+ return UniformOverrides.contains(V);
+}
+
template <typename ContextT>
GenericUniformityInfo<ContextT>::GenericUniformityInfo(
const DominatorTreeT &DT, const CycleInfoT &CI,
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 91fddce7e7e47..7587e9074870b 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2367,9 +2367,10 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
llvm_unreachable("impossible call instruction");
}
- /// Return the uniformity behavior of the given instruction.
- virtual InstructionUniformity
- getInstructionUniformity(const MachineInstr &MI) const {
+ /// Return the uniformity behavior of the given instruction's output.
+ /// \p DefIdx specifies which output to query (for multi-output instructions).
+ virtual InstructionUniformity getDefUniformity(const MachineInstr &MI,
+ unsigned DefIdx = 0) const {
return InstructionUniformity::Default;
}
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index b56534935d7c2..3f021584ea901 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -34,7 +34,7 @@ template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
InstructionUniformity IU = TTI->getInstructionUniformity(&I);
switch (IU) {
case InstructionUniformity::AlwaysUniform:
- addUniformOverride(I);
+ addUniformOverride(&I);
continue;
case InstructionUniformity::NeverUniform:
markDivergent(I);
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index dbadb67e1e6d2..6947c12d3eb1e 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -46,24 +46,86 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::markDefsDivergent(
return insertedDivergent;
}
+template <>
+bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isAlwaysUniform(
+ const MachineInstr &Instr) const {
+ // For MIR, an instruction is "always uniform" only if it has at least one
+ // virtual register def AND all those defs are in UniformOverrides.
+ // Instructions with no virtual register defs (e.g., terminators like
+ // G_BRCOND, G_BR) return false to ensure they can be properly processed
+ // for divergence during propagation.
+ bool HasVirtualDef = false;
+ for (const MachineOperand &Op : Instr.all_defs()) {
+ if (!Op.getReg().isVirtual())
+ continue;
+ HasVirtualDef = true;
+ if (!UniformOverrides.contains(Op.getReg()))
+ return false;
+ }
+  // Only return true if we found at least one virtual def and all were uniform.
+ return HasVirtualDef;
+}
+
template <>
void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
- const auto &InstrInfo = *F.getSubtarget().getInstrInfo();
-
- for (const MachineBasicBlock &block : F) {
- for (const MachineInstr &instr : block) {
- auto uniformity = InstrInfo.getInstructionUniformity(instr);
-
- switch (uniformity) {
- case InstructionUniformity::AlwaysUniform:
- addUniformOverride(instr);
- break;
- case InstructionUniformity::NeverUniform:
- markDivergent(instr);
- break;
- case InstructionUniformity::Default:
- break;
+ const TargetInstrInfo &InstrInfo = *F.getSubtarget().getInstrInfo();
+ const MachineRegisterInfo &MRI = F.getRegInfo();
+ const RegisterBankInfo &RBI = *F.getSubtarget().getRegBankInfo();
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+
+ for (const MachineBasicBlock &Block : F) {
+ for (const MachineInstr &Instr : Block) {
+ // Terminators are handled separately because:
+ // 1. Many terminators (G_BRCOND, G_BR) have no def operands, so the
+ // per-def loop below would skip them entirely.
+ // 2. Divergent terminators mark the BLOCK as divergent
+ // (DivergentTermBlocks),
+ // not individual values, which is different from regular instructions.
+ // For terminators like SI_IF/SI_ELSE, getDefUniformity() checks the
+ // isNeverUniform() flag, so DefIdx is not relevant here.
+ if (Instr.isTerminator()) {
+ InstructionUniformity Uniformity = InstrInfo.getDefUniformity(Instr);
+ if (Uniformity == InstructionUniformity::NeverUniform) {
+ if (DivergentTermBlocks.insert(Instr.getParent()).second) {
+ Worklist.push_back(&Instr);
+ }
+ }
+ continue;
+ }
+
+ // Query uniformity for each def operand separately.
+ unsigned DefIdx = 0;
+ bool HasDivergentDef = false;
+ for (const MachineOperand &Op : Instr.all_defs()) {
+ if (!Op.getReg().isVirtual()) {
+ DefIdx++;
+ continue;
+ }
+
+ InstructionUniformity Uniformity =
+ InstrInfo.getDefUniformity(Instr, DefIdx);
+
+ switch (Uniformity) {
+ case InstructionUniformity::AlwaysUniform:
+ addUniformOverride(Op.getReg());
+ break;
+ case InstructionUniformity::NeverUniform:
+ // Skip registers that are inherently uniform (e.g., SGPRs on AMDGPU)
+ // even if the instruction is marked as NeverUniform.
+ if (!TRI.isUniformReg(MRI, RBI, Op.getReg())) {
+ if (markDivergent(Op.getReg()))
+ HasDivergentDef = true;
+ }
+ break;
+ case InstructionUniformity::Default:
+ break;
+ }
+ DefIdx++;
}
+      // If any def was marked divergent, add the instruction to the worklist
+ // for divergence propagation to users.
+ if (HasDivergentDef)
+ Worklist.push_back(&Instr);
}
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 09efba485f6f8..0b775d8232603 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10614,7 +10614,8 @@ SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const {
}
InstructionUniformity
-SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
+SIInstrInfo::getGenericDefUniformity(const MachineInstr &MI,
+ unsigned DefIdx) const {
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
unsigned Opcode = MI.getOpcode();
@@ -10639,19 +10640,34 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
return HandleAddrSpaceCast(MI);
- if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
- auto IID = GI->getIntrinsicID();
- if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
- return InstructionUniformity::NeverUniform;
+ if (const GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI)) {
+ Intrinsic::ID IID = GI->getIntrinsicID();
+ if (AMDGPU::isIntrinsicSourceOfDivergence(IID)) {
+ // Some intrinsics produce multiple outputs with mixed uniformity.
+ // For these, we need to check DefIdx to determine which output is being
+ // queried and return the appropriate uniformity.
+ switch (IID) {
+ case Intrinsic::amdgcn_if:
+ case Intrinsic::amdgcn_else:
+ // These intrinsics produce two outputs:
+ // DefIdx=0: "Exec mask not zero" flag (i1) - inherits divergence from
+ // the input condition to ensure proper divergence
+ // propagation.
+ // DefIdx=1: Saved exec mask (i64) - always uniform as all active
+ // lanes see the same mask value.
+ return DefIdx == 1 ? InstructionUniformity::AlwaysUniform
+ : InstructionUniformity::Default;
+ default:
+ return InstructionUniformity::NeverUniform;
+ }
+ }
if (AMDGPU::isIntrinsicAlwaysUniform(IID))
return InstructionUniformity::AlwaysUniform;
switch (IID) {
case Intrinsic::amdgcn_addrspacecast_nonnull:
return HandleAddrSpaceCast(MI);
- case Intrinsic::amdgcn_if:
- case Intrinsic::amdgcn_else:
- // FIXME: Uniform if second result
+ default:
break;
}
@@ -10694,8 +10710,8 @@ const MIRFormatter *SIInstrInfo::getMIRFormatter() const {
return Formatter.get();
}
-InstructionUniformity
-SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
+InstructionUniformity SIInstrInfo::getDefUniformity(const MachineInstr &MI,
+ unsigned DefIdx) const {
if (isNeverUniform(MI))
return InstructionUniformity::NeverUniform;
@@ -10719,7 +10735,7 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
// GMIR handling
if (MI.isPreISelOpcode())
- return SIInstrInfo::getGenericInstructionUniformity(MI);
+ return SIInstrInfo::getGenericDefUniformity(MI, DefIdx);
// Atomics are divergent because they are executed sequentially: when an
// atomic operation refers to the same address in each thread, then each
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 05cf804d08ffc..5ab60267fd9ab 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1667,11 +1667,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
const MachineOperand &getCalleeOperand(const MachineInstr &MI) const override;
- InstructionUniformity
- getInstructionUniformity(const MachineInstr &MI) const final;
+ InstructionUniformity getDefUniformity(const MachineInstr &MI,
+ unsigned DefIdx = 0) const final;
- InstructionUniformity
- getGenericInstructionUniformity(const MachineInstr &MI) const;
+ InstructionUniformity getGenericDefUniformity(const MachineInstr &MI,
+ unsigned DefIdx = 0) const;
const MIRFormatter *getMIRFormatter() const override;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir
index 27c53815feb06..e0d23d4af85dd 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir
@@ -5,7 +5,8 @@
# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt)
# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
+# The first output (i1) of amdgcn.if inherits divergence from input.
+# The second output (exec mask, i64) is always uniform.
# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1
# CHECK: DIVERGENT: G_BR %bb.2
@@ -13,7 +14,8 @@
# CHECK-LABEL: BLOCK bb.2
# CHECK-NOT: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.1, %{{[0-9]*}}:_(s32), %bb.0
# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_PHI %{{[0-9]*}}:_(s1), %bb.1, %{{[0-9]*}}:_(s1), %bb.0
-# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
+# The first output (i1) of amdgcn.if inherits divergence from input.
+# The second output (exec mask, i64) is always uniform.
# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if)
# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.3
# CHECK: DIVERGENT: G_BR %bb.4
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir
index db8cb74bd578c..45cc2058d00d4 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir
@@ -6,7 +6,7 @@
# - First result (i1): Inherits divergence from the input condition.
# - Second result (i64): Saved exec mask - always uniform.
-# Test amdgcn.if with UNIFORM input
+# Test amdgcn.if with UNIFORM input - both outputs should be uniform
---
name: amdgcn_if_uniform_input
tracksRegLiveness: true
@@ -15,15 +15,14 @@ machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: MachineUniformityInfo for function: @amdgcn_if_uniform_input
- ; Currently both outputs are marked divergent even with uniform input
- ; CHECK: DIVERGENT: %1
- ; CHECK: DIVERGENT: %2
+ ; With uniform input, both outputs are uniform
+ ; CHECK: ALL VALUES UNIFORM
%0:_(s1) = G_IMPLICIT_DEF
%1:_(s1), %2:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %0:_(s1)
S_ENDPGM 0
...
-# Test amdgcn.if with DIVERGENT input
+# Test amdgcn.if with DIVERGENT input - first output divergent, second uniform
---
name: amdgcn_if_divergent_input
tracksRegLiveness: true
@@ -32,9 +31,10 @@ machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: MachineUniformityInfo for function: @amdgcn_if_divergent_input
- ; Both outputs are divergent
+ ; First output (%3, i1) inherits divergence from the divergent input
; CHECK: DIVERGENT: %3
- ; CHECK: DIVERGENT: %4
+ ; Second output (%4, exec mask) is always uniform regardless of input
+ ; CHECK-NOT: DIVERGENT: %4
%0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
%1:_(s32) = G_CONSTANT i32 16
%2:_(s1) = G_ICMP intpred(slt), %0:_(s32), %1:_
@@ -42,7 +42,7 @@ body: |
S_ENDPGM 0
...
-# Test amdgcn.else with UNIFORM input
+# Test amdgcn.else with UNIFORM input - both outputs should be uniform
---
name: amdgcn_else_uniform_input
tracksRegLiveness: true
@@ -51,15 +51,14 @@ machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: MachineUniformityInfo for function: @amdgcn_else_uniform_input
- ; Currently both outputs are marked divergent even with uniform input
- ; CHECK: DIVERGENT: %1
- ; CHECK: DIVERGENT: %2
+ ; With uniform input, both outputs are uniform
+ ; CHECK: ALL VALUES UNIFORM
%0:_(s64) = G_IMPLICIT_DEF
%1:_(s1), %2:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %0:_(s64)
S_ENDPGM 0
...
-# Test amdgcn.else with DIVERGENT input
+# Test amdgcn.else with DIVERGENT input - first output divergent, second uniform
---
name: amdgcn_else_divergent_input
tracksRegLiveness: true
@@ -68,9 +67,10 @@ machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: MachineUniformityInfo for function: @amdgcn_else_divergent_input
- ; Both outputs are divergent
+ ; First output (%2, i1) inherits divergence from the divergent input
; CHECK: DIVERGENT: %2
- ; CHECK: DIVERGENT: %3
+ ; Second output (%3, exec mask) is always uniform regardless of input
+ ; CHECK-NOT: DIVERGENT: %3
%0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
%1:_(s64) = G_ZEXT %0:_(s32)
%2:_(s1), %3:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %1:_(s64)
>From 111bef25592df954fe02886a96b79b68a3fc0321 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Wed, 4 Feb 2026 15:10:22 +0530
Subject: [PATCH 3/5] Update uniformity per value instead of per instruction
---
llvm/include/llvm/ADT/GenericUniformityImpl.h | 2 +-
llvm/include/llvm/ADT/Uniformity.h | 12 +-
.../llvm/Analysis/TargetTransformInfo.h | 8 +-
.../llvm/Analysis/TargetTransformInfoImpl.h | 4 +-
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 8 +-
llvm/lib/Analysis/TargetTransformInfo.cpp | 8 +-
llvm/lib/Analysis/UniformityAnalysis.cpp | 13 +-
.../lib/CodeGen/MachineUniformityAnalysis.cpp | 26 ++--
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 9 +-
.../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 2 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 67 +++++-----
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 8 +-
.../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 7 +-
.../Target/NVPTX/NVPTXTargetTransformInfo.h | 2 +-
.../AMDGPU/MIR/per-output-uniformity.mir | 114 +++++++++++++++++-
15 files changed, 197 insertions(+), 93 deletions(-)
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 55af9be64883e..b497eaff7b264 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -443,7 +443,7 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
SyncDependenceAnalysisT SDA;
// Set of known-uniform values.
- DenseSet<ConstValueRefT> UniformOverrides;
+ SmallDenseSet<ConstValueRefT, 8> UniformOverrides;
/// \brief Mark all nodes in \p JoinBlock as divergent and push them on
/// the worklist.
diff --git a/llvm/include/llvm/ADT/Uniformity.h b/llvm/include/llvm/ADT/Uniformity.h
index 21ca106b80be3..1e7104ec9b4b4 100644
--- a/llvm/include/llvm/ADT/Uniformity.h
+++ b/llvm/include/llvm/ADT/Uniformity.h
@@ -11,18 +11,18 @@
namespace llvm {
-/// Enum describing how instructions behave with respect to uniformity and
+/// Enum describing how values behave with respect to uniformity and
/// divergence, to answer the question: if the same instruction is executed by
-/// two threads in a convergent set of threads, will its result value(s) be
+/// two threads in a convergent set of threads, will its result value be
/// uniform, i.e. the same on both threads?
-enum class InstructionUniformity {
- /// The result values are uniform if and only if all operands are uniform.
+enum class ValueUniformity {
+ /// The result value is uniform if and only if all operands are uniform.
Default,
- /// The result values are always uniform.
+ /// The result value is always uniform.
AlwaysUniform,
- /// The result values can never be assumed to be uniform.
+ /// The result value can never be assumed to be uniform.
NeverUniform
};
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index a013122df5f06..e4fc27fdd8a41 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -517,14 +517,14 @@ class TargetTransformInfo {
/// uniformity analysis and assume all values are uniform.
LLVM_ABI bool hasBranchDivergence(const Function *F = nullptr) const;
- /// Get target-specific uniformity information for an instruction.
+ /// Get target-specific uniformity information for a value.
/// This allows targets to provide more fine-grained control over
- /// uniformity analysis by specifying whether specific instructions
+  /// uniformity analysis by specifying whether specific values
/// should always or never be considered uniform, or require custom
/// operand-based analysis.
/// \param V The value to query for uniformity information.
- /// \return InstructionUniformity.
- LLVM_ABI InstructionUniformity getInstructionUniformity(const Value *V) const;
+ /// \return ValueUniformity.
+ LLVM_ABI ValueUniformity getValueUniformity(const Value *V) const;
/// Query the target whether the specified address space cast from FromAS to
/// ToAS is valid.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 6d27cabf404f8..311cfe9d454b1 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -132,8 +132,8 @@ class TargetTransformInfoImplBase {
return false;
}
- virtual InstructionUniformity getInstructionUniformity(const Value *V) const {
- return InstructionUniformity::Default;
+ virtual ValueUniformity getValueUniformity(const Value *V) const {
+ return ValueUniformity::Default;
}
virtual bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 7587e9074870b..bb232ba4b4dec 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2367,11 +2367,11 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
llvm_unreachable("impossible call instruction");
}
- /// Return the uniformity behavior of the given instruction's output.
+ /// Return the uniformity behavior of the given value (def operand).
/// \p DefIdx specifies which output to query (for multi-output instructions).
- virtual InstructionUniformity getDefUniformity(const MachineInstr &MI,
- unsigned DefIdx = 0) const {
- return InstructionUniformity::Default;
+ virtual ValueUniformity getValueUniformity(const MachineInstr &MI,
+ unsigned DefIdx = 0) const {
+ return ValueUniformity::Default;
}
/// Returns true if the given \p MI defines a TargetIndex operand that can be
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 19785204ed2b3..def1bd37da041 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -294,14 +294,14 @@ bool TargetTransformInfo::hasBranchDivergence(const Function *F) const {
return TTIImpl->hasBranchDivergence(F);
}
-InstructionUniformity
-llvm::TargetTransformInfo::getInstructionUniformity(const Value *V) const {
+ValueUniformity
+llvm::TargetTransformInfo::getValueUniformity(const Value *V) const {
// Calls with the NoDivergenceSource attribute are always uniform.
if (const auto *Call = dyn_cast<CallBase>(V)) {
if (Call->hasFnAttr(Attribute::NoDivergenceSource))
- return InstructionUniformity::AlwaysUniform;
+ return ValueUniformity::AlwaysUniform;
}
- return TTIImpl->getInstructionUniformity(V);
+ return TTIImpl->getValueUniformity(V);
}
bool llvm::TargetTransformInfo::isValidAddrSpaceCast(unsigned FromAS,
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 3f021584ea901..55b3ca5f8c098 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -31,21 +31,20 @@ bool llvm::GenericUniformityAnalysisImpl<SSAContext>::markDefsDivergent(
template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
for (auto &I : instructions(F)) {
- InstructionUniformity IU = TTI->getInstructionUniformity(&I);
- switch (IU) {
- case InstructionUniformity::AlwaysUniform:
+ ValueUniformity VU = TTI->getValueUniformity(&I);
+ switch (VU) {
+ case ValueUniformity::AlwaysUniform:
addUniformOverride(&I);
continue;
- case InstructionUniformity::NeverUniform:
+ case ValueUniformity::NeverUniform:
markDivergent(I);
continue;
- case InstructionUniformity::Default:
+ case ValueUniformity::Default:
break;
}
}
for (auto &Arg : F.args()) {
- if (TTI->getInstructionUniformity(&Arg) ==
- InstructionUniformity::NeverUniform)
+ if (TTI->getValueUniformity(&Arg) == ValueUniformity::NeverUniform)
markDivergent(&Arg);
}
}
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 6947c12d3eb1e..55eb19ea5f8e8 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -77,15 +77,15 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
for (const MachineInstr &Instr : Block) {
// Terminators are handled separately because:
// 1. Many terminators (G_BRCOND, G_BR) have no def operands, so the
- // per-def loop below would skip them entirely.
- // 2. Divergent terminators mark the BLOCK as divergent
- // (DivergentTermBlocks),
- // not individual values, which is different from regular instructions.
- // For terminators like SI_IF/SI_ELSE, getDefUniformity() checks the
- // isNeverUniform() flag, so DefIdx is not relevant here.
+ // per-def loop below would skip them entirely.
+ // 2. Divergent terminators mark the BLOCK as
+  //    divergent (DivergentTermBlocks), not individual values, which is
+ // different from regular instructions. For terminators like
+ // SI_IF/SI_ELSE, getValueUniformity() checks the isNeverUniform() flag,
+ // so DefIdx is not relevant here.
if (Instr.isTerminator()) {
- InstructionUniformity Uniformity = InstrInfo.getDefUniformity(Instr);
- if (Uniformity == InstructionUniformity::NeverUniform) {
+ ValueUniformity Uniformity = InstrInfo.getValueUniformity(Instr);
+ if (Uniformity == ValueUniformity::NeverUniform) {
if (DivergentTermBlocks.insert(Instr.getParent()).second) {
Worklist.push_back(&Instr);
}
@@ -102,14 +102,14 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
continue;
}
- InstructionUniformity Uniformity =
- InstrInfo.getDefUniformity(Instr, DefIdx);
+ ValueUniformity Uniformity =
+ InstrInfo.getValueUniformity(Instr, DefIdx);
switch (Uniformity) {
- case InstructionUniformity::AlwaysUniform:
+ case ValueUniformity::AlwaysUniform:
addUniformOverride(Op.getReg());
break;
- case InstructionUniformity::NeverUniform:
+ case ValueUniformity::NeverUniform:
// Skip registers that are inherently uniform (e.g., SGPRs on AMDGPU)
// even if the instruction is marked as NeverUniform.
if (!TRI.isUniformReg(MRI, RBI, Op.getReg())) {
@@ -117,7 +117,7 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
HasDivergentDef = true;
}
break;
- case InstructionUniformity::Default:
+ case ValueUniformity::Default:
break;
}
DefIdx++;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 3e02587f61336..9199001c02a05 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1620,13 +1620,12 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
return BaseT::getNumberOfParts(Tp);
}
-InstructionUniformity
-GCNTTIImpl::getInstructionUniformity(const Value *V) const {
+ValueUniformity GCNTTIImpl::getValueUniformity(const Value *V) const {
if (isAlwaysUniform(V))
- return InstructionUniformity::AlwaysUniform;
+ return ValueUniformity::AlwaysUniform;
if (isSourceOfDivergence(V))
- return InstructionUniformity::NeverUniform;
+ return ValueUniformity::NeverUniform;
- return InstructionUniformity::Default;
+ return ValueUniformity::Default;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 3ec157aacd0aa..dd1a11ad6fe04 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -310,7 +310,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
/// implementation.
unsigned getNumberOfParts(Type *Tp) const override;
- InstructionUniformity getInstructionUniformity(const Value *V) const override;
+ ValueUniformity getValueUniformity(const Value *V) const override;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0b775d8232603..9c9139a1de0c6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10613,9 +10613,8 @@ SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const {
return TargetInstrInfo::getCalleeOperand(MI);
}
-InstructionUniformity
-SIInstrInfo::getGenericDefUniformity(const MachineInstr &MI,
- unsigned DefIdx) const {
+ValueUniformity SIInstrInfo::getGenericValueUniformity(const MachineInstr &MI,
+ unsigned DefIdx) const {
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
unsigned Opcode = MI.getOpcode();
@@ -10630,8 +10629,8 @@ SIInstrInfo::getGenericDefUniformity(const MachineInstr &MI,
return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
DstAS == AMDGPUAS::FLAT_ADDRESS &&
ST.hasGloballyAddressableScratch()
- ? InstructionUniformity::NeverUniform
- : InstructionUniformity::Default;
+ ? ValueUniformity::NeverUniform
+ : ValueUniformity::Default;
};
// If the target supports globally addressable scratch, the mapping from
@@ -10650,19 +10649,21 @@ SIInstrInfo::getGenericDefUniformity(const MachineInstr &MI,
case Intrinsic::amdgcn_if:
case Intrinsic::amdgcn_else:
// These intrinsics produce two outputs:
- // DefIdx=0: "Exec mask not zero" flag (i1) - inherits divergence from
- // the input condition to ensure proper divergence
- // propagation.
+ // DefIdx=0: Boolean (i1) indicating whether the "then" block should
+ // execute. After the exec mask is updated (ANDed with the
+ // condition), this is true if exec is non-zero (at least
+ // one lane active), false if exec is zero (no lanes
+ // active). Inherits divergence from the input condition.
// DefIdx=1: Saved exec mask (i64) - always uniform as all active
- // lanes see the same mask value.
- return DefIdx == 1 ? InstructionUniformity::AlwaysUniform
- : InstructionUniformity::Default;
+ // lanes observe the same mask value.
+ return DefIdx == 1 ? ValueUniformity::AlwaysUniform
+ : ValueUniformity::Default;
default:
- return InstructionUniformity::NeverUniform;
+ return ValueUniformity::NeverUniform;
}
}
if (AMDGPU::isIntrinsicAlwaysUniform(IID))
- return InstructionUniformity::AlwaysUniform;
+ return ValueUniformity::AlwaysUniform;
switch (IID) {
case Intrinsic::amdgcn_addrspacecast_nonnull:
@@ -10671,7 +10672,7 @@ SIInstrInfo::getGenericDefUniformity(const MachineInstr &MI,
break;
}
- return InstructionUniformity::Default;
+ return ValueUniformity::Default;
}
// Loads from the private and flat address spaces are divergent, because
@@ -10683,25 +10684,25 @@ SIInstrInfo::getGenericDefUniformity(const MachineInstr &MI,
if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
Opcode == AMDGPU::G_SEXTLOAD) {
if (MI.memoperands_empty())
- return InstructionUniformity::NeverUniform; // conservative assumption
+ return ValueUniformity::NeverUniform; // conservative assumption
if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
})) {
// At least one MMO in a non-global address space.
- return InstructionUniformity::NeverUniform;
+ return ValueUniformity::NeverUniform;
}
- return InstructionUniformity::Default;
+ return ValueUniformity::Default;
}
if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
AMDGPU::isGenericAtomic(Opcode)) {
- return InstructionUniformity::NeverUniform;
+ return ValueUniformity::NeverUniform;
}
- return InstructionUniformity::Default;
+ return ValueUniformity::Default;
}
const MIRFormatter *SIInstrInfo::getMIRFormatter() const {
@@ -10710,32 +10711,32 @@ const MIRFormatter *SIInstrInfo::getMIRFormatter() const {
return Formatter.get();
}
-InstructionUniformity SIInstrInfo::getDefUniformity(const MachineInstr &MI,
- unsigned DefIdx) const {
+ValueUniformity SIInstrInfo::getValueUniformity(const MachineInstr &MI,
+ unsigned DefIdx) const {
if (isNeverUniform(MI))
- return InstructionUniformity::NeverUniform;
+ return ValueUniformity::NeverUniform;
unsigned opcode = MI.getOpcode();
if (opcode == AMDGPU::V_READLANE_B32 ||
opcode == AMDGPU::V_READFIRSTLANE_B32 ||
opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
- return InstructionUniformity::AlwaysUniform;
+ return ValueUniformity::AlwaysUniform;
if (isCopyInstr(MI)) {
const MachineOperand &srcOp = MI.getOperand(1);
if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
const TargetRegisterClass *regClass =
RI.getPhysRegBaseClass(srcOp.getReg());
- return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
- : InstructionUniformity::NeverUniform;
+ return RI.isSGPRClass(regClass) ? ValueUniformity::AlwaysUniform
+ : ValueUniformity::NeverUniform;
}
- return InstructionUniformity::Default;
+ return ValueUniformity::Default;
}
// GMIR handling
if (MI.isPreISelOpcode())
- return SIInstrInfo::getGenericDefUniformity(MI, DefIdx);
+ return SIInstrInfo::getGenericValueUniformity(MI, DefIdx);
// Atomics are divergent because they are executed sequentially: when an
// atomic operation refers to the same address in each thread, then each
@@ -10743,24 +10744,24 @@ InstructionUniformity SIInstrInfo::getDefUniformity(const MachineInstr &MI,
// original value.
if (isAtomic(MI))
- return InstructionUniformity::NeverUniform;
+ return ValueUniformity::NeverUniform;
// Loads from the private and flat address spaces are divergent, because
// threads can execute the load instruction with the same inputs and get
// different results.
if (isFLAT(MI) && MI.mayLoad()) {
if (MI.memoperands_empty())
- return InstructionUniformity::NeverUniform; // conservative assumption
+ return ValueUniformity::NeverUniform; // conservative assumption
if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
})) {
// At least one MMO in a non-global address space.
- return InstructionUniformity::NeverUniform;
+ return ValueUniformity::NeverUniform;
}
- return InstructionUniformity::Default;
+ return ValueUniformity::Default;
}
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
@@ -10782,7 +10783,7 @@ InstructionUniformity SIInstrInfo::getDefUniformity(const MachineInstr &MI,
// register, which are all scalars.
const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
- return InstructionUniformity::NeverUniform;
+ return ValueUniformity::NeverUniform;
}
  // TODO: Uniformity check conditions above can be rearranged for more
@@ -10792,7 +10793,7 @@ InstructionUniformity SIInstrInfo::getDefUniformity(const MachineInstr &MI,
// currently turned into no-op COPYs by SelectionDAG ISel and are
// therefore no longer recognizable.
- return InstructionUniformity::Default;
+ return ValueUniformity::Default;
}
unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 5ab60267fd9ab..4366d6f55d767 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1667,11 +1667,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
const MachineOperand &getCalleeOperand(const MachineInstr &MI) const override;
- InstructionUniformity getDefUniformity(const MachineInstr &MI,
- unsigned DefIdx = 0) const final;
+ ValueUniformity getValueUniformity(const MachineInstr &MI,
+ unsigned DefIdx = 0) const final;
- InstructionUniformity getGenericDefUniformity(const MachineInstr &MI,
- unsigned DefIdx = 0) const;
+ ValueUniformity getGenericValueUniformity(const MachineInstr &MI,
+ unsigned DefIdx = 0) const;
const MIRFormatter *getMIRFormatter() const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index c1fe9300785a3..01948d8e3264a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -671,10 +671,9 @@ void NVPTXTTIImpl::collectKernelLaunchBounds(
LB.push_back({"maxntidz", MaxNTID[2]});
}
-InstructionUniformity
-NVPTXTTIImpl::getInstructionUniformity(const Value *V) const {
+ValueUniformity NVPTXTTIImpl::getValueUniformity(const Value *V) const {
if (isSourceOfDivergence(V))
- return InstructionUniformity::NeverUniform;
+ return ValueUniformity::NeverUniform;
- return InstructionUniformity::Default;
+ return ValueUniformity::Default;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 40eb161bc8666..dca2a0893f2fa 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -207,7 +207,7 @@ class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {
return false;
}
- InstructionUniformity getInstructionUniformity(const Value *V) const override;
+ ValueUniformity getValueUniformity(const Value *V) const override;
};
} // end namespace llvm
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir
index 45cc2058d00d4..53bae05a2bb09 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/per-output-uniformity.mir
@@ -1,10 +1,19 @@
# RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
# RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -passes='print<machine-uniformity>' -filetype=null %s 2>&1 | FileCheck %s
-# Test per-output uniformity analysis for amdgcn.if and amdgcn.else intrinsics.
-# These intrinsics produce two outputs:
-# - First result (i1): Inherits divergence from the input condition.
-# - Second result (i64): Saved exec mask - always uniform.
+# Test per-output (per-value) uniformity analysis for instructions with multiple
+# definitions where each definition can have different uniformity characteristics.
+#
+# The amdgcn.if and amdgcn.else intrinsics are the primary AMDGPU instructions
+# that produce multiple outputs with DIFFERENT uniformity:
+# - First result (i1): Boolean flag indicating if any lanes are active.
+# Inherits divergence from the input condition.
+# - Second result (i64): Saved exec mask - always uniform as all active lanes
+# see the same mask value.
+#
+# Most other multi-output instructions (G_SDIVREM, G_UADDO, G_ATOMIC_CMPXCHG_WITH_SUCCESS,
+# etc.) have all outputs with the SAME uniformity, so they don't require special
+# per-output handling.
# Test amdgcn.if with UNIFORM input - both outputs should be uniform
---
@@ -76,3 +85,100 @@ body: |
%2:_(s1), %3:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %1:_(s64)
S_ENDPGM 0
...
+
+---
+name: chained_if_else
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: MachineUniformityInfo for function: @chained_if_else
+ ; Divergent condition from workitem.id.x
+ ; CHECK: DIVERGENT: %0
+ ; CHECK: DIVERGENT: %2
+ ; CHECK: DIVERGENT: %3
+ ; CHECK-NOT: DIVERGENT: %4
+ ; CHECK-NOT: DIVERGENT: %5
+ ; CHECK-NOT: DIVERGENT: %6
+ %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+ %1:_(s32) = G_CONSTANT i32 16
+ %2:_(s1) = G_ICMP intpred(slt), %0:_(s32), %1:_
+ %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1)
+ ; The exec mask from if (%4) is always uniform, so else sees uniform input
+ %5:_(s1), %6:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %4:_(s64)
+ S_ENDPGM 0
+...
+
+---
+name: uniform_exec_mask_usage
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: MachineUniformityInfo for function: @uniform_exec_mask_usage
+ ; CHECK: DIVERGENT: %0
+ ; CHECK: DIVERGENT: %2
+ ; CHECK: DIVERGENT: %3
+ ; CHECK-NOT: DIVERGENT: %4
+ ; CHECK-NOT: DIVERGENT: %5
+ ; CHECK-NOT: DIVERGENT: %6
+ %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+ %1:_(s32) = G_CONSTANT i32 16
+ %2:_(s1) = G_ICMP intpred(slt), %0:_(s32), %1:_
+ %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1)
+ ; The exec mask is uniform, so operations on it are uniform
+ %5:_(s64) = G_CONSTANT i64 0
+ %6:_(s1) = G_ICMP intpred(eq), %4:_(s64), %5:_
+ S_ENDPGM 0
+...
+
+---
+name: divergent_flag_propagation
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: MachineUniformityInfo for function: @divergent_flag_propagation
+ ; Divergent input
+ ; CHECK: DIVERGENT: %0
+ ; CHECK: DIVERGENT: %2
+ ; CHECK: DIVERGENT: %3
+ ; CHECK-NOT: DIVERGENT: %4
+ ; CHECK: DIVERGENT: %5
+ ; CHECK: DIVERGENT: %6
+ %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+ %1:_(s32) = G_CONSTANT i32 16
+ %2:_(s1) = G_ICMP intpred(slt), %0:_(s32), %1:_
+ %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1)
+ ; The divergent flag (%3) propagates divergence
+ %5:_(s1) = G_XOR %3:_, %3:_
+ %6:_(s32) = G_ZEXT %3:_(s1)
+ S_ENDPGM 0
+...
+
+---
+name: nested_if_divergent
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: MachineUniformityInfo for function: @nested_if_divergent
+ ; First amdgcn.if with divergent input
+ ; CHECK: DIVERGENT: %0
+ ; CHECK: DIVERGENT: %2
+ ; CHECK: DIVERGENT: %3
+ ; CHECK-NOT: DIVERGENT: %4
+ ; CHECK: DIVERGENT: %5
+ ; CHECK-NOT: DIVERGENT: %6
+ %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+ %1:_(s32) = G_CONSTANT i32 16
+ %2:_(s1) = G_ICMP intpred(slt), %0:_(s32), %1:_
+ %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2:_(s1)
+ ; Nested if using the divergent output from first if
+ %5:_(s1), %6:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %3:_(s1)
+ S_ENDPGM 0
+...
>From bd8ae2c90e7abb4e9f3cda7b364f7df4a813a216 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Fri, 6 Feb 2026 13:47:47 +0530
Subject: [PATCH 4/5] review: add assert for def idx and reg
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9c9139a1de0c6..ed54971e78d89 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10656,6 +10656,10 @@ ValueUniformity SIInstrInfo::getGenericValueUniformity(const MachineInstr &MI,
// active). Inherits divergence from the input condition.
// DefIdx=1: Saved exec mask (i64) - always uniform as all active
// lanes observe the same mask value.
+ assert(DefIdx < 2 && "amdgcn_if/amdgcn_else have exactly 2 defs");
+ assert(MI.getOperand(DefIdx).isReg() &&
+ MI.getOperand(DefIdx).getReg().isVirtual() &&
+ "Expected virtual register def");
return DefIdx == 1 ? ValueUniformity::AlwaysUniform
: ValueUniformity::Default;
default:
>From 1ab00fd9b51b01fc82c60b9e05c638d3a210cefe Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Fri, 6 Feb 2026 17:52:22 +0530
Subject: [PATCH 5/5] review: remove using IntrinsicLaneMaskAnalyzer for SI_IF
and SI_ELSE
---
llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp | 5 -----
1 file changed, 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index f36935d8c0e8f..6650a0fa02a21 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -95,11 +95,6 @@ void IntrinsicLaneMaskAnalyzer::initLaneMaskIntrinsics(MachineFunction &MF) {
S32S64LaneMask.insert(MI.getOperand(3).getReg());
S32S64LaneMask.insert(MI.getOperand(0).getReg());
}
-
- if (MI.getOpcode() == AMDGPU::SI_IF ||
- MI.getOpcode() == AMDGPU::SI_ELSE) {
- S32S64LaneMask.insert(MI.getOperand(0).getReg());
- }
}
}
}
More information about the llvm-commits
mailing list