[llvm] [AMDGPU][Uniformity][TTI] Make Uniformity Analysis Operand-Aware via Custom Divergence Checks (PR #137639)
Pankaj Dwivedi via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 24 04:03:08 PDT 2026
https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/137639
>From a30a915e7f241a840f96668d0a1be250de6c4b1c Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 23 Sep 2025 17:25:29 +0530
Subject: [PATCH 01/17] [NFC] move isDivergentUse earlier so that the dependent
 logic in pushUsers can safely use it
---
llvm/lib/Analysis/UniformityAnalysis.cpp | 26 ++++++++++++------------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index b56534935d7c2..f6cf67463c8a1 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -29,6 +29,19 @@ bool llvm::GenericUniformityAnalysisImpl<SSAContext>::markDefsDivergent(
return markDivergent(cast<Value>(&Instr));
}
+template <>
+bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
+ const Use &U) const {
+ const auto *V = U.get();
+ if (isDivergent(V))
+ return true;
+ if (const auto *DefInstr = dyn_cast<Instruction>(V)) {
+ const auto *UseInstr = cast<Instruction>(U.getUser());
+ return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
+ }
+ return false;
+}
+
template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
for (auto &I : instructions(F)) {
InstructionUniformity IU = TTI->getInstructionUniformity(&I);
@@ -95,19 +108,6 @@ void llvm::GenericUniformityAnalysisImpl<
}
}
-template <>
-bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
- const Use &U) const {
- const auto *V = U.get();
- if (isDivergent(V))
- return true;
- if (const auto *DefInstr = dyn_cast<Instruction>(V)) {
- const auto *UseInstr = cast<Instruction>(U.getUser());
- return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
- }
- return false;
-}
-
// This ensures explicit instantiation of
// GenericUniformityAnalysisImpl::ImplDeleter::operator()
template class llvm::GenericUniformityInfo<SSAContext>;
>From c41a9ec639d8bf07578b16d037c13768e6bbd224 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Sat, 1 Nov 2025 02:02:14 +0530
Subject: [PATCH 02/17] add target hook to capture special operand uniformity
and update UA to use it
---
llvm/include/llvm/ADT/GenericUniformityImpl.h | 22 +++++++
llvm/include/llvm/ADT/Uniformity.h | 5 +-
.../llvm/Analysis/TargetTransformInfo.h | 1 +
.../llvm/Analysis/TargetTransformInfoImpl.h | 4 ++
llvm/lib/Analysis/TargetTransformInfo.cpp | 5 ++
llvm/lib/Analysis/UniformityAnalysis.cpp | 39 ++++++++----
.../lib/CodeGen/MachineUniformityAnalysis.cpp | 11 ++++
.../UniformityAnalysis/AMDGPU/intrinsics.ll | 4 +-
.../AMDGPU/uniform_intrinsic.ll | 59 +++++++++++++++++++
9 files changed, 134 insertions(+), 16 deletions(-)
create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 2db76a1ad9b13..23a774bed21a9 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -51,6 +51,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Uniformity.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "uniformity"
@@ -407,6 +408,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
void recordTemporalDivergence(ConstValueRefT, const InstructionT *,
const CycleT *);
+ bool isOperandUniform(const InstructionT &I, InstructionUniformity IU) const;
+
+ /// \brief keep track of target instruction that can be proven uniform.
+ void addUniformInstruction(const InstructionT *I, InstructionUniformity IU);
+
protected:
const ContextT &Context;
const FunctionT &F;
@@ -420,6 +426,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
// Internal worklist for divergence propagation.
std::vector<const InstructionT *> Worklist;
+ // Map containing tracked instruction that can be proven uniform based on its
+ // operand Uniformity.
+ llvm::DenseMap<const InstructionT *, InstructionUniformity>
+ UniformInstruction;
+
/// \brief Mark \p Term as divergent and push all Instructions that become
/// divergent as a result on the worklist.
void analyzeControlDivergence(const InstructionT &Term);
@@ -783,6 +794,11 @@ void GenericUniformityAnalysisImpl<ContextT>::markDivergent(
const InstructionT &I) {
if (isAlwaysUniform(I))
return;
+ auto It = UniformInstruction.find(&I);
+ if (It != UniformInstruction.end() && isOperandUniform(I, It->second)) {
+ addUniformOverride(I);
+ return;
+ }
bool Marked = false;
if (I.isTerminator()) {
Marked = DivergentTermBlocks.insert(I.getParent()).second;
@@ -814,6 +830,12 @@ void GenericUniformityAnalysisImpl<ContextT>::addUniformOverride(
UniformOverrides.insert(&Instr);
}
+template <typename ContextT>
+void GenericUniformityAnalysisImpl<ContextT>::addUniformInstruction(
+ const InstructionT *I, InstructionUniformity IU) {
+ UniformInstruction[I] = IU;
+}
+
// Mark as divergent all external uses of values defined in \p DefCycle.
//
// A value V defined by a block B inside \p DefCycle may be used outside the
diff --git a/llvm/include/llvm/ADT/Uniformity.h b/llvm/include/llvm/ADT/Uniformity.h
index 21ca106b80be3..9571d43b8a9b9 100644
--- a/llvm/include/llvm/ADT/Uniformity.h
+++ b/llvm/include/llvm/ADT/Uniformity.h
@@ -23,7 +23,10 @@ enum class InstructionUniformity {
AlwaysUniform,
/// The result values can never be assumed to be uniform.
- NeverUniform
+ NeverUniform,
+
+ /// Result value can be uniform if either of first two operand are uniform.
+ EitherOfFirstTwoOp
};
} // namespace llvm
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 9cb4a97f9459c..707c57e1160ba 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -2083,6 +2083,7 @@ class TargetTransformInfo {
/// Returns true if GEP should not be used to index into vectors for this
/// target.
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const;
+ InstructionUniformity getInstructionUniformity(const Instruction &I) const;
private:
std::unique_ptr<const TargetTransformInfoImplBase> TTIImpl;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 410b9187a5fed..94d6d9ecba35c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1227,6 +1227,10 @@ class TargetTransformInfoImplBase {
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {}
virtual bool allowVectorElementIndexingUsingGEP() const { return true; }
+ virtual InstructionUniformity
+ getInstructionUniformity(const Instruction &I) const {
+ return InstructionUniformity::Default;
+ }
protected:
// Obtain the minimum required size to hold the value (without the sign)
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index b0afba2c78884..bf26ff4079da5 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1551,6 +1551,11 @@ bool TargetTransformInfo::allowVectorElementIndexingUsingGEP() const {
return TTIImpl->allowVectorElementIndexingUsingGEP();
}
+InstructionUniformity
+TargetTransformInfo::getInstructionUniformity(const Instruction &I) const {
+ return TTIImpl->getInstructionUniformity(I);
+}
+
TargetTransformInfoImplBase::~TargetTransformInfoImplBase() = default;
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index f6cf67463c8a1..9b7cb33d7f76a 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -8,6 +8,7 @@
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/ADT/GenericUniformityImpl.h"
+#include "llvm/ADT/Uniformity.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
@@ -29,19 +30,6 @@ bool llvm::GenericUniformityAnalysisImpl<SSAContext>::markDefsDivergent(
return markDivergent(cast<Value>(&Instr));
}
-template <>
-bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
- const Use &U) const {
- const auto *V = U.get();
- if (isDivergent(V))
- return true;
- if (const auto *DefInstr = dyn_cast<Instruction>(V)) {
- const auto *UseInstr = cast<Instruction>(U.getUser());
- return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
- }
- return false;
-}
-
template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
for (auto &I : instructions(F)) {
InstructionUniformity IU = TTI->getInstructionUniformity(&I);
@@ -108,6 +96,31 @@ void llvm::GenericUniformityAnalysisImpl<
}
}
+template <>
+bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
+ const Use &U) const {
+ const auto *V = U.get();
+ if (isDivergent(V))
+ return true;
+ if (const auto *DefInstr = dyn_cast<Instruction>(V)) {
+ const auto *UseInstr = cast<Instruction>(U.getUser());
+ return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
+ }
+ return false;
+}
+
+template <>
+bool GenericUniformityAnalysisImpl<SSAContext>::isOperandUniform(
+ const Instruction &I, InstructionUniformity IU) const {
+ switch (IU) {
+ case InstructionUniformity::EitherOfFirstTwoOp:
+ return !isDivergentUse(I.getOperandUse(0)) ||
+ !isDivergentUse(I.getOperandUse(1));
+ default:
+ return false;
+ }
+}
+
// This ensures explicit instantiation of
// GenericUniformityAnalysisImpl::ImplDeleter::operator()
template class llvm::GenericUniformityInfo<SSAContext>;
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index dbadb67e1e6d2..ffe878f82f207 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -151,6 +151,17 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
}
+template <>
+bool GenericUniformityAnalysisImpl<MachineSSAContext>::isOperandUniform(
+ const MachineInstr &I, InstructionUniformity IU) const {
+ switch (IU) {
+ case InstructionUniformity::EitherOfFirstTwoOp:
+ return !isDivergentUse(I.getOperand(0)) || !isDivergentUse(I.getOperand(1));
+ default:
+ return false;
+ }
+}
+
// This ensures explicit instantiation of
// GenericUniformityAnalysisImpl::ImplDeleter::operator()
template class llvm::GenericUniformityInfo<MachineSSAContext>;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 46cb8cc1312dc..9b91c7bee84bd 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -7,14 +7,14 @@ define amdgpu_kernel void @ds_swizzle(ptr addrspace(1) %out, i32 %src) #0 {
ret void
}
-; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+; CHECK: ALL VALUES UNIFORM
define amdgpu_kernel void @v_permlane16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
%v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
store i32 %v, ptr addrspace(1) %out
ret void
}
-; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+; CHECK: ALL VALUES UNIFORM
define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
%v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
store i32 %v, ptr addrspace(1) %out
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
new file mode 100644
index 0000000000000..37be465a7796b
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+
+; CHECK: ALL VALUES UNIFORM
+define amdgpu_kernel void @v_permlane16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: ALL VALUES UNIFORM
+define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @div_permlane16_var_uni_usr_x16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+ %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ store i32 %v1, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @div_permlane16_var_uni_x16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+ %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ store i32 %v1, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @div_permlane16_var_uni_usr_16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+ %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ store i32 %v1, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @div_permlane16_var_uni_16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+ %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ store i32 %v1, ptr addrspace(1) %out
+ ret void
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
>From c9a6192d5e6a253cea7e049e4accb22947fdd658 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 4 Nov 2025 14:21:00 +0530
Subject: [PATCH 03/17] update enum name for more clarity
---
llvm/include/llvm/ADT/Uniformity.h | 5 +++--
llvm/lib/Analysis/UniformityAnalysis.cpp | 2 +-
llvm/lib/CodeGen/MachineUniformityAnalysis.cpp | 4 ++--
3 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/llvm/include/llvm/ADT/Uniformity.h b/llvm/include/llvm/ADT/Uniformity.h
index 9571d43b8a9b9..ed558b004d322 100644
--- a/llvm/include/llvm/ADT/Uniformity.h
+++ b/llvm/include/llvm/ADT/Uniformity.h
@@ -25,8 +25,9 @@ enum class InstructionUniformity {
/// The result values can never be assumed to be uniform.
NeverUniform,
- /// Result value can be uniform if either of first two operand are uniform.
- EitherOfFirstTwoOp
+ /// Result value can be uniform if any of the first two use operand are
+ /// uniform.
+ AnyOfFirstTwoUseOp
};
} // namespace llvm
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 9b7cb33d7f76a..494ba272aecf7 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -113,7 +113,7 @@ template <>
bool GenericUniformityAnalysisImpl<SSAContext>::isOperandUniform(
const Instruction &I, InstructionUniformity IU) const {
switch (IU) {
- case InstructionUniformity::EitherOfFirstTwoOp:
+ case InstructionUniformity::AnyOfFirstTwoUseOp:
return !isDivergentUse(I.getOperandUse(0)) ||
!isDivergentUse(I.getOperandUse(1));
default:
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index ffe878f82f207..8f6652ee27f25 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -155,8 +155,8 @@ template <>
bool GenericUniformityAnalysisImpl<MachineSSAContext>::isOperandUniform(
const MachineInstr &I, InstructionUniformity IU) const {
switch (IU) {
- case InstructionUniformity::EitherOfFirstTwoOp:
- return !isDivergentUse(I.getOperand(0)) || !isDivergentUse(I.getOperand(1));
+ case InstructionUniformity::AnyOfFirstTwoUseOp:
+ return !isDivergentUse(I.getOperand(1)) || !isDivergentUse(I.getOperand(2));
default:
return false;
}
>From f59ff790dbc92c76097dee809567775ca502cbe9 Mon Sep 17 00:00:00 2001
From: Pankaj Dwivedi <divedi.pk.117 at gmail.com>
Date: Wed, 5 Nov 2025 10:53:47 +0530
Subject: [PATCH 04/17] Apply suggestion from @arsenm
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
llvm/include/llvm/ADT/GenericUniformityImpl.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 23a774bed21a9..da305ba283222 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -428,7 +428,7 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
// Map containing tracked instruction that can be proven uniform based on its
// operand Uniformity.
- llvm::DenseMap<const InstructionT *, InstructionUniformity>
+ DenseMap<const InstructionT *, InstructionUniformity>
UniformInstruction;
/// \brief Mark \p Term as divergent and push all Instructions that become
>From e64e3a6ffaf443fd630fa96826049ba5eeddabf4 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Mon, 17 Nov 2025 15:17:36 +0530
Subject: [PATCH 05/17] let getInstructionUniformity hook wrap
isSourceOfDivergence/isAlwaysUniform
---
llvm/include/llvm/Analysis/TargetTransformInfo.h | 3 ++-
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 7 +++++--
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 ++--
llvm/lib/CodeGen/MachineUniformityAnalysis.cpp | 10 +++-------
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 2 ++
llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 1 -
6 files changed, 14 insertions(+), 13 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 707c57e1160ba..437b47c5a59ef 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -2083,7 +2083,8 @@ class TargetTransformInfo {
/// Returns true if GEP should not be used to index into vectors for this
/// target.
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const;
- InstructionUniformity getInstructionUniformity(const Instruction &I) const;
+
+ InstructionUniformity getInstructionUniformity(const Value *V) const;
private:
std::unique_ptr<const TargetTransformInfoImplBase> TTIImpl;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 94d6d9ecba35c..43d73c3fc72cb 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1227,8 +1227,11 @@ class TargetTransformInfoImplBase {
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {}
virtual bool allowVectorElementIndexingUsingGEP() const { return true; }
- virtual InstructionUniformity
- getInstructionUniformity(const Instruction &I) const {
+
+ // New API for uniformity classification
+ // Targets should override this to provide target-specific uniformity analysis
+ // The default implementation returns Default (conservative behavior)
+ virtual InstructionUniformity getInstructionUniformity(const Value *V) const {
return InstructionUniformity::Default;
}
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index bf26ff4079da5..a26742ea23a96 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1552,8 +1552,8 @@ bool TargetTransformInfo::allowVectorElementIndexingUsingGEP() const {
}
InstructionUniformity
-TargetTransformInfo::getInstructionUniformity(const Instruction &I) const {
- return TTIImpl->getInstructionUniformity(I);
+TargetTransformInfo::getInstructionUniformity(const Value *V) const {
+ return TTIImpl->getInstructionUniformity(V);
}
TargetTransformInfoImplBase::~TargetTransformInfoImplBase() = default;
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 8f6652ee27f25..782bb11fe0cb7 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -151,15 +151,11 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
}
+// This can be defined later depending on use of the MachineUniformityAnalysis.
template <>
bool GenericUniformityAnalysisImpl<MachineSSAContext>::isOperandUniform(
- const MachineInstr &I, InstructionUniformity IU) const {
- switch (IU) {
- case InstructionUniformity::AnyOfFirstTwoUseOp:
- return !isDivergentUse(I.getOperand(1)) || !isDivergentUse(I.getOperand(2));
- default:
- return false;
- }
+ const MachineInstr &MI, InstructionUniformity IU) const {
+ return false;
}
// This ensures explicit instantiation of
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index e81ce4aaf6fe9..d3f01046faa2a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1727,6 +1727,8 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
return BaseT::getNumberOfParts(Tp);
}
+// New API that wraps the old isSourceOfDivergence and isAlwaysUniform APIs
+// with additional support for new uniformity classifications
InstructionUniformity
GCNTTIImpl::getInstructionUniformity(const Value *V) const {
if (isAlwaysUniform(V))
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 899249db54574..3f59684b00b44 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -221,7 +221,6 @@ class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {
// Self-referential globals are not supported.
return false;
}
-
InstructionUniformity getInstructionUniformity(const Value *V) const override;
};
>From 12ceaf7c935ddfd608855d0196a7b881106229f5 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 19 Nov 2025 17:38:58 +0530
Subject: [PATCH 06/17] update the operand check & update machine inst
uniformity
---
llvm/lib/Analysis/UniformityAnalysis.cpp | 7 +-
.../lib/CodeGen/MachineUniformityAnalysis.cpp | 35 +++++++-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 7 ++
.../AMDGPU/MIR/uniform-permlane.mir | 86 +++++++++++++++++++
.../AMDGPU/uniform_intrinsic.ll | 8 +-
5 files changed, 135 insertions(+), 8 deletions(-)
create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 494ba272aecf7..9a02b65f97bc5 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -114,8 +114,11 @@ bool GenericUniformityAnalysisImpl<SSAContext>::isOperandUniform(
const Instruction &I, InstructionUniformity IU) const {
switch (IU) {
case InstructionUniformity::AnyOfFirstTwoUseOp:
- return !isDivergentUse(I.getOperandUse(0)) ||
- !isDivergentUse(I.getOperandUse(1));
+ // For permlane16/permlanex16: <old> <src0> <src1> <src2> <fi>
+ // <bound_control> Check if either src0 (operand 1) or src1 (operand 2 -
+ // lane select) is uniform
+ return !isDivergentUse(I.getOperandUse(1)) ||
+ !isDivergentUse(I.getOperandUse(2));
default:
return false;
}
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 782bb11fe0cb7..1cbd9450f4e99 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -151,11 +151,42 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
}
-// This can be defined later depending on use of the MachineUniformityAnalysis.
template <>
bool GenericUniformityAnalysisImpl<MachineSSAContext>::isOperandUniform(
const MachineInstr &MI, InstructionUniformity IU) const {
- return false;
+ switch (IU) {
+ // For permlane16/permlanex16, check if either src or lane select is uniform
+ // These instructions have mixed immediate and register operands:
+ // Operand 1 is src0 (the source value to permute)
+ // Operand 3 is src1 (lane select - which lane within the 16 to read from)
+ // Result is uniform if EITHER the source OR lane select is uniform
+ case InstructionUniformity::AnyOfFirstTwoUseOp: {
+ // Check if any of the first two register use operands is uniform
+ // Result is uniform if ANY of these operands is uniform
+ const MachineOperand *FirstRegOp = nullptr;
+ const MachineOperand *SecondRegOp = nullptr;
+
+ // Find the first two register use operands
+ for (const MachineOperand &MO : MI.uses()) {
+ if (MO.isReg() && MO.getReg().isVirtual()) {
+ if (!FirstRegOp)
+ FirstRegOp = &MO;
+ else if (!SecondRegOp) {
+ SecondRegOp = &MO;
+ break;
+ }
+ }
+ }
+
+ if (!FirstRegOp || !SecondRegOp)
+ return false;
+
+ // Return true if either operand is uniform
+ return !isDivergentUse(*FirstRegOp) || !isDivergentUse(*SecondRegOp);
+ }
+ default:
+ return false;
+ }
}
// This ensures explicit instantiation of
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5086c553da101..6eea224babdbe 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10803,6 +10803,13 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::NeverUniform;
unsigned opcode = MI.getOpcode();
+
+ // Special handling for permlane16/permlanex16 - uniformity depends on
+ // operands
+ if (opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
+ opcode == AMDGPU::V_PERMLANEX16_B32_e64)
+ return InstructionUniformity::AnyOfFirstTwoUseOp;
+
if (opcode == AMDGPU::V_READLANE_B32 ||
opcode == AMDGPU::V_READFIRSTLANE_B32 ||
opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
new file mode 100644
index 0000000000000..f08d16affef23
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
@@ -0,0 +1,86 @@
+# RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+
+# Test the machine-level uniformity analysis for permlane16/permlanex16 instructions.
+#
+# NOTE: Permlane instructions have a hardware constraint that src1 (lane select) and src2
+# must be SGPR (scalar) registers. Since SGPRs are always uniform at machine level,
+# permlane results are always uniform according to the AnyOfFirstTwoUseOp logic
+# (either src0 OR src1 being uniform makes the result uniform, and src1 is always uniform).
+#
+# These tests verify that the uniformity analysis correctly handles permlane instructions
+# and that uniform results propagate through chains of operations.
+
+---
+# Test: permlane16 with divergent VGPR src and uniform SGPR lane select
+# Result is UNIFORM because lane select (SGPR) is always uniform
+name: permlane16_basic
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: MachineUniformityInfo for function: permlane16_basic
+ ; CHECK: ALL VALUES UNIFORM
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = S_MOV_B32 5
+ %2:sreg_32 = IMPLICIT_DEF
+ %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+# Test: permlanex16 with divergent VGPR src and uniform SGPR lane select
+# Result is UNIFORM because lane select (SGPR) is always uniform
+name: permlanex16_basic
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: MachineUniformityInfo for function: permlanex16_basic
+ ; CHECK: ALL VALUES UNIFORM
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = S_MOV_B32 7
+ %2:sreg_32 = IMPLICIT_DEF
+ %3:vgpr_32 = V_PERMLANEX16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+# Test: Chain of permlane operations - uniformity propagates
+# Both permlanes are uniform, second uses result of first as source
+name: permlane16_chain_uniform
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: MachineUniformityInfo for function: permlane16_chain_uniform
+ ; CHECK: ALL VALUES UNIFORM
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = S_MOV_B32 3
+ %2:sreg_32 = IMPLICIT_DEF
+ ; First permlane - uniform because lane select is SGPR
+ %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
+ ; Second permlane uses uniform result - also uniform
+ %4:vgpr_32 = V_PERMLANEX16_B32_e64 0, %3, 0, %1, 0, %2, %3, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+# Test: Multiple permlane operations in sequence
+# Verifies that uniformity is correctly tracked through complex chains
+name: permlane_multiple
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: MachineUniformityInfo for function: permlane_multiple
+ ; CHECK: ALL VALUES UNIFORM
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = S_MOV_B32 1
+ %2:sreg_32 = S_MOV_B32 2
+ %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
+ %4:vgpr_32 = V_PERMLANEX16_B32_e64 0, %3, 0, %1, 0, %2, %3, 0, implicit $exec
+ %5:vgpr_32 = V_PERMLANE16_B32_e64 0, %4, 0, %2, 0, %1, %4, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
index 37be465a7796b..e7391ee0c265b 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
@@ -16,11 +16,11 @@ define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i
}
; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK: %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false)
; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
define amdgpu_kernel void @div_permlane16_var_uni_usr_x16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
%v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
- %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false) #0
store i32 %v1, ptr addrspace(1) %out
ret void
}
@@ -36,11 +36,11 @@ define amdgpu_kernel void @div_permlane16_var_uni_x16(ptr addrspace(1) %out, i32
}
; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK: %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false)
; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
define amdgpu_kernel void @div_permlane16_var_uni_usr_16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
%v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
- %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false) #0
store i32 %v1, ptr addrspace(1) %out
ret void
}
>From cf4dcef9b39841bc53d0a5b3f7774a916a902bc5 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 19 Nov 2025 18:59:57 +0530
Subject: [PATCH 07/17] Fix formatting
---
llvm/include/llvm/ADT/GenericUniformityImpl.h | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index da305ba283222..d06a8d3f28716 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -1,4 +1,4 @@
-//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===//
+//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -428,8 +428,7 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
// Map containing tracked instruction that can be proven uniform based on its
// operand Uniformity.
- DenseMap<const InstructionT *, InstructionUniformity>
- UniformInstruction;
+ DenseMap<const InstructionT *, InstructionUniformity> UniformInstruction;
/// \brief Mark \p Term as divergent and push all Instructions that become
/// divergent as a result on the worklist.
>From 96fe05eea124cdb6492be22d1df51457a9a10a26 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 19 Nov 2025 19:50:03 +0530
Subject: [PATCH 08/17] update mir test check
---
.../UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
index f08d16affef23..da6048d86b2dd 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
@@ -18,7 +18,7 @@ machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
- ; CHECK-LABEL: MachineUniformityInfo for function: permlane16_basic
+ ; CHECK-LABEL: MachineUniformityInfo for function: @permlane16_basic
; CHECK: ALL VALUES UNIFORM
%0:vgpr_32 = IMPLICIT_DEF
%1:sreg_32 = S_MOV_B32 5
@@ -35,7 +35,7 @@ machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
- ; CHECK-LABEL: MachineUniformityInfo for function: permlanex16_basic
+ ; CHECK-LABEL: MachineUniformityInfo for function: @permlanex16_basic
; CHECK: ALL VALUES UNIFORM
%0:vgpr_32 = IMPLICIT_DEF
%1:sreg_32 = S_MOV_B32 7
@@ -52,7 +52,7 @@ machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
- ; CHECK-LABEL: MachineUniformityInfo for function: permlane16_chain_uniform
+ ; CHECK-LABEL: MachineUniformityInfo for function: @permlane16_chain_uniform
; CHECK: ALL VALUES UNIFORM
%0:vgpr_32 = IMPLICIT_DEF
%1:sreg_32 = S_MOV_B32 3
@@ -72,7 +72,7 @@ machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
- ; CHECK-LABEL: MachineUniformityInfo for function: permlane_multiple
+ ; CHECK-LABEL: MachineUniformityInfo for function: @permlane_multiple
; CHECK: ALL VALUES UNIFORM
%0:vgpr_32 = IMPLICIT_DEF
%1:sreg_32 = S_MOV_B32 1
>From 41eea7b41f985489dc14a0c881f575abfde42250 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Thu, 20 Nov 2025 16:30:26 +0530
Subject: [PATCH 09/17] separate complex target-based custom logic through
target hook
---
llvm/include/llvm/ADT/GenericUniformityImpl.h | 15 ++++--
llvm/include/llvm/ADT/Uniformity.h | 7 +--
.../llvm/Analysis/TargetTransformInfo.h | 12 +++++
.../llvm/Analysis/TargetTransformInfoImpl.h | 8 +++
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 16 ++++++
llvm/lib/Analysis/TargetTransformInfo.cpp | 5 ++
llvm/lib/Analysis/UniformityAnalysis.cpp | 29 +++++-----
.../lib/CodeGen/MachineUniformityAnalysis.cpp | 53 ++++++++-----------
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 38 ++++++++++++-
.../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 3 ++
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 23 ++++++--
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 3 ++
12 files changed, 154 insertions(+), 58 deletions(-)
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index d06a8d3f28716..c76964f9d7571 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -408,9 +408,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
void recordTemporalDivergence(ConstValueRefT, const InstructionT *,
const CycleT *);
- bool isOperandUniform(const InstructionT &I, InstructionUniformity IU) const;
+ /// Check if an instruction with Custom uniformity can be proven uniform
+ /// based on its operands. This queries the target-specific callback.
+ bool isCustomUniform(const InstructionT &I) const;
- /// \brief keep track of target instruction that can be proven uniform.
+ /// \brief keep track of instructions that require custom uniformity analysis.
void addUniformInstruction(const InstructionT *I, InstructionUniformity IU);
protected:
@@ -793,10 +795,13 @@ void GenericUniformityAnalysisImpl<ContextT>::markDivergent(
const InstructionT &I) {
if (isAlwaysUniform(I))
return;
+ // Check if instruction requires custom uniformity analysis
auto It = UniformInstruction.find(&I);
- if (It != UniformInstruction.end() && isOperandUniform(I, It->second)) {
- addUniformOverride(I);
- return;
+ if (It != UniformInstruction.end()) {
+ if (It->second == InstructionUniformity::Custom && isCustomUniform(I)) {
+ addUniformOverride(I);
+ return;
+ }
}
bool Marked = false;
if (I.isTerminator()) {
diff --git a/llvm/include/llvm/ADT/Uniformity.h b/llvm/include/llvm/ADT/Uniformity.h
index ed558b004d322..43e588745f73f 100644
--- a/llvm/include/llvm/ADT/Uniformity.h
+++ b/llvm/include/llvm/ADT/Uniformity.h
@@ -25,9 +25,10 @@ enum class InstructionUniformity {
/// The result values can never be assumed to be uniform.
NeverUniform,
- /// Result value can be uniform if any of the first two use operand are
- /// uniform.
- AnyOfFirstTwoUseOp
+ /// If all operands are uniform, the result values are uniform. Otherwise,
+ /// the result values may be divergent, and a custom check may be used to
+ /// determine uniformity via a callback.
+ Custom
};
} // namespace llvm
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 437b47c5a59ef..bbead5aab4bab 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -2086,6 +2086,18 @@ class TargetTransformInfo {
InstructionUniformity getInstructionUniformity(const Value *V) const;
+ /// Determine if an instruction with some operands uniform can be proven
+ /// uniform. This is used for custom uniformity analysis where the target
+ /// can define complex rules that depend on which specific operands are
+ /// uniform.
+ ///
+ /// \param I The instruction to check.
+ /// \param UniformArgs A bitvector indicating which operands are known to be
+ /// uniform (bit N corresponds to operand N).
+ /// \returns true if the instruction result can be proven uniform given the
+ /// uniform operands, false otherwise.
+ bool isUniform(const Instruction *I, const SmallBitVector &UniformArgs) const;
+
private:
std::unique_ptr<const TargetTransformInfoImplBase> TTIImpl;
};
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 43d73c3fc72cb..24032caab5ac1 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1235,6 +1235,14 @@ class TargetTransformInfoImplBase {
return InstructionUniformity::Default;
}
+ // Custom uniformity check for instructions marked as Custom
+ // Override this to provide complex uniformity rules based on which operands
+ // are uniform
+ virtual bool isUniform(const Instruction *I,
+ const SmallBitVector &UniformArgs) const {
+ return false; // Conservative: assume divergent
+ }
+
protected:
// Obtain the minimum required size to hold the value (without the sign)
// In case of a vector it returns the min required size for one element.
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 77f710203d1fc..67c3ac5f4c40e 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2367,6 +2367,22 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
return InstructionUniformity::Default;
}
+ /// Determine if a machine instruction with some operands uniform can be
+ /// proven uniform. This is used for custom uniformity analysis where the
+ /// target can define complex rules that depend on which specific operands
+ /// are uniform.
+ ///
+ /// \param MI The machine instruction to check.
+ /// \param UniformArgs A bitvector indicating which register operands are
+ /// known to be uniform (bit N corresponds to the Nth
+ /// register use operand).
+ /// \returns true if the instruction result can be proven uniform given the
+ /// uniform operands, false otherwise.
+ virtual bool isUniform(const MachineInstr &MI,
+ const SmallBitVector &UniformArgs) const {
+ return false; // Conservative: assume divergent
+ }
+
/// Returns true if the given \p MI defines a TargetIndex operand that can be
/// tracked by their offset, can have values, and can have debug info
/// associated with it. If so, sets \p Index and \p Offset of the target index
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index a26742ea23a96..ce2063fbb8008 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1556,6 +1556,11 @@ TargetTransformInfo::getInstructionUniformity(const Value *V) const {
return TTIImpl->getInstructionUniformity(V);
}
+bool TargetTransformInfo::isUniform(const Instruction *I,
+ const SmallBitVector &UniformArgs) const {
+ return TTIImpl->isUniform(I, UniformArgs);
+}
+
TargetTransformInfoImplBase::~TargetTransformInfoImplBase() = default;
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 9a02b65f97bc5..663faf88e915f 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -8,6 +8,7 @@
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/ADT/GenericUniformityImpl.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/Uniformity.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -36,10 +37,14 @@ template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
switch (IU) {
case InstructionUniformity::AlwaysUniform:
addUniformOverride(I);
- continue;
+ break;
case InstructionUniformity::NeverUniform:
markDivergent(I);
- continue;
+ break;
+ case InstructionUniformity::Custom:
+ // Instructions requiring custom uniformity analysis based on operands
+ addUniformInstruction(&I, IU);
+ break;
case InstructionUniformity::Default:
break;
}
@@ -110,18 +115,16 @@ bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
}
template <>
-bool GenericUniformityAnalysisImpl<SSAContext>::isOperandUniform(
- const Instruction &I, InstructionUniformity IU) const {
- switch (IU) {
- case InstructionUniformity::AnyOfFirstTwoUseOp:
- // For permlane16/permlanex16: <old> <src0> <src1> <src2> <fi>
- // <bound_control> Check if either src0 (operand 1) or src1 (operand 2 -
- // lane select) is uniform
- return !isDivergentUse(I.getOperandUse(1)) ||
- !isDivergentUse(I.getOperandUse(2));
- default:
- return false;
+bool GenericUniformityAnalysisImpl<SSAContext>::isCustomUniform(
+ const Instruction &I) const {
+ // Build bitvector of uniform operands
+ SmallBitVector UniformArgs(I.getNumOperands());
+ for (unsigned OpIdx = 0; OpIdx < I.getNumOperands(); ++OpIdx) {
+ UniformArgs[OpIdx] = !isDivergentUse(I.getOperandUse(OpIdx));
}
+
+ // Query target-specific uniformity callback
+ return TTI->isUniform(&I, UniformArgs);
}
// This ensures explicit instantiation of
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 1cbd9450f4e99..4e43287dc9424 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -8,6 +8,7 @@
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/ADT/GenericUniformityImpl.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/MachineCycleAnalysis.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -61,6 +62,10 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
case InstructionUniformity::NeverUniform:
markDivergent(instr);
break;
+ case InstructionUniformity::Custom:
+ // Instructions requiring custom uniformity analysis based on operands
+ addUniformInstruction(&instr, uniformity);
+ break;
case InstructionUniformity::Default:
break;
}
@@ -152,41 +157,25 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
}
template <>
-bool GenericUniformityAnalysisImpl<MachineSSAContext>::isOperandUniform(
- const MachineInstr &MI, InstructionUniformity IU) const {
- switch (IU) {
- // For permlane16/permlanex16, check if either src or lane select is uniform
- // These instructions have mixed immediate and register operands:
- // Operand 1 is src0 (the source value to permute)
- // Operand 3 is src1 (lane select - which lane within the 16 to read from)
- // Result is uniform if EITHER the source OR lane select is uniform
- case InstructionUniformity::AnyOfFirstTwoUseOp: {
- // Check if any of the first two register use operands is uniform
- // Result is uniform if ANY of these operands is uniform
- const MachineOperand *FirstRegOp = nullptr;
- const MachineOperand *SecondRegOp = nullptr;
-
- // Find the first two register use operands
- for (const MachineOperand &MO : MI.uses()) {
- if (MO.isReg() && MO.getReg().isVirtual()) {
- if (!FirstRegOp)
- FirstRegOp = &MO;
- else if (!SecondRegOp) {
- SecondRegOp = &MO;
- break;
- }
- }
- }
-
- if (!FirstRegOp || !SecondRegOp)
- return false;
+bool GenericUniformityAnalysisImpl<MachineSSAContext>::isCustomUniform(
+ const MachineInstr &MI) const {
+ const auto &InstrInfo = *F.getSubtarget().getInstrInfo();
- // Return true if either operand is uniform
- return !isDivergentUse(*FirstRegOp) || !isDivergentUse(*SecondRegOp);
+ // Build bitvector of uniform register use operands
+ SmallVector<const MachineOperand *, 4> RegUseOps;
+ for (const MachineOperand &MO : MI.uses()) {
+ if (MO.isReg() && MO.getReg().isVirtual()) {
+ RegUseOps.push_back(&MO);
+ }
}
- default:
- return false;
+
+ SmallBitVector UniformArgs(RegUseOps.size());
+ for (unsigned i = 0; i < RegUseOps.size(); ++i) {
+ UniformArgs[i] = !isDivergentUse(*RegUseOps[i]);
}
+
+ // Query target-specific uniformity callback
+ return InstrInfo.isUniform(MI, UniformArgs);
}
// This ensures explicit instantiation of
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index d3f01046faa2a..2f50a4a22927a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -19,6 +19,7 @@
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIModeRegisterDefaults.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -1727,10 +1728,18 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
return BaseT::getNumberOfParts(Tp);
}
-// New API that wraps the old isSourceOfDivergence and isAlwaysUniform APIs
-// with additional support for new uniformity classifications
InstructionUniformity
GCNTTIImpl::getInstructionUniformity(const Value *V) const {
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16:
+ return InstructionUniformity::Custom;
+ default:
+ break;
+ }
+ }
+
if (isAlwaysUniform(V))
return InstructionUniformity::AlwaysUniform;
@@ -1786,3 +1795,28 @@ bool GCNTTIImpl::shouldDropLSRSolutionIfLessProfitable() const {
// Prefer the baseline when LSR cannot clearly reduce per-iteration work.
return true;
}
+
+bool GCNTTIImpl::isUniform(const Instruction *I,
+ const SmallBitVector &UniformArgs) const {
+ // Custom uniformity check for permlane16/permlanex16
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(I)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16:
+ // For permlane16/permlanex16:
+ // Operand 0: old value (ignored for uniformity)
+ // Operand 1: src0 (source value to permute)
+ // Operand 2: src1 (lane select within 16-lane group)
+ // Operand 3: src2 (which 16-lane group)
+ // Result is uniform if either src0 (op 1) or src1 (op 2) is uniform
+ if (UniformArgs.size() > 2) {
+ return UniformArgs[1] || UniformArgs[2];
+ }
+ return false;
+ default:
+ break;
+ }
+ }
+
+ return false;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index dc7d01533da02..ea2bf72836199 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -321,6 +321,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
const TTI::LSRCost &B) const override;
bool isNumRegsMajorCostOfLSR() const override;
bool shouldDropLSRSolutionIfLessProfitable() const override;
+
+ bool isUniform(const Instruction *I,
+ const SmallBitVector &UniformArgs) const override;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6eea224babdbe..b15d57d346669 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -20,6 +20,7 @@
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -10804,11 +10805,10 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
unsigned opcode = MI.getOpcode();
- // Special handling for permlane16/permlanex16 - uniformity depends on
- // operands
+ // permlane16/permlanex16 require custom uniformity analysis
if (opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
opcode == AMDGPU::V_PERMLANEX16_B32_e64)
- return InstructionUniformity::AnyOfFirstTwoUseOp;
+ return InstructionUniformity::Custom;
if (opcode == AMDGPU::V_READLANE_B32 ||
opcode == AMDGPU::V_READFIRSTLANE_B32 ||
@@ -10887,6 +10887,23 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::Default;
}
+bool SIInstrInfo::isUniform(const MachineInstr &MI,
+ const SmallBitVector &UniformArgs) const {
+ unsigned opcode = MI.getOpcode();
+
+ // Custom uniformity check for permlane16/permlanex16
+ if (opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
+ opcode == AMDGPU::V_PERMLANEX16_B32_e64) {
+ // Result is uniform if either src0 or src1 is uniform
+ // UniformArgs[0] = src0 (source value)
+ // UniformArgs[1] = src1 (lane select)
+ if (UniformArgs.size() >= 2) {
+ return UniformArgs[0] || UniformArgs[1];
+ }
+ }
+
+ return false;
+}
unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
switch (MF.getFunction().getCallingConv()) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 93d28d22bfd16..429982f75f29d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1680,6 +1680,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
InstructionUniformity
getInstructionUniformity(const MachineInstr &MI) const final;
+ bool isUniform(const MachineInstr &MI,
+ const SmallBitVector &UniformArgs) const final;
+
InstructionUniformity
getGenericInstructionUniformity(const MachineInstr &MI) const;
>From 1865da335920214d7046702d64a6ac48bd294a6a Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 16 Dec 2025 18:28:29 +0530
Subject: [PATCH 10/17] refactor: rebased with latest changes
---
llvm/include/llvm/Analysis/TargetTransformInfo.h | 2 --
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 7 -------
llvm/lib/Analysis/TargetTransformInfo.cpp | 5 -----
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 1 -
4 files changed, 15 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index bbead5aab4bab..b6dc19a873ec4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -2084,8 +2084,6 @@ class TargetTransformInfo {
/// target.
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const;
- InstructionUniformity getInstructionUniformity(const Value *V) const;
-
/// Determine if an instruction with some operands uniform can be proven
/// uniform. This is used for custom uniformity analysis where the target
/// can define complex rules that depend on which specific operands are
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 24032caab5ac1..246c05709c4d8 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1228,13 +1228,6 @@ class TargetTransformInfoImplBase {
virtual bool allowVectorElementIndexingUsingGEP() const { return true; }
- // New API for uniformity classification
- // Targets should override this to provide target-specific uniformity analysis
- // The default implementation returns Default (conservative behavior)
- virtual InstructionUniformity getInstructionUniformity(const Value *V) const {
- return InstructionUniformity::Default;
- }
-
// Custom uniformity check for instructions marked as Custom
// Override this to provide complex uniformity rules based on which operands
// are uniform
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index ce2063fbb8008..9345595555c2d 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1551,11 +1551,6 @@ bool TargetTransformInfo::allowVectorElementIndexingUsingGEP() const {
return TTIImpl->allowVectorElementIndexingUsingGEP();
}
-InstructionUniformity
-TargetTransformInfo::getInstructionUniformity(const Value *V) const {
- return TTIImpl->getInstructionUniformity(V);
-}
-
bool TargetTransformInfo::isUniform(const Instruction *I,
const SmallBitVector &UniformArgs) const {
return TTIImpl->isUniform(I, UniformArgs);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 2f50a4a22927a..79842504b19ee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1817,6 +1817,5 @@ bool GCNTTIImpl::isUniform(const Instruction *I,
break;
}
}
-
return false;
}
>From f2f8b5ad236a05379d05208a67d41170500ac025 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 16 Dec 2025 18:48:14 +0530
Subject: [PATCH 11/17] refactor
---
llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 3f59684b00b44..899249db54574 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -221,6 +221,7 @@ class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {
// Self-referential globals are not supported.
return false;
}
+
InstructionUniformity getInstructionUniformity(const Value *V) const override;
};
>From 2bbcf97a220e7be3e049cf47c52e000833886e03 Mon Sep 17 00:00:00 2001
From: Pankaj Dwivedi <divedi.pk.117 at gmail.com>
Date: Tue, 16 Dec 2025 19:06:39 +0530
Subject: [PATCH 12/17] Update llvm/lib/Analysis/UniformityAnalysis.cpp
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
llvm/lib/Analysis/UniformityAnalysis.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 663faf88e915f..7866aec824dbf 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -119,7 +119,7 @@ bool GenericUniformityAnalysisImpl<SSAContext>::isCustomUniform(
const Instruction &I) const {
// Build bitvector of uniform operands
SmallBitVector UniformArgs(I.getNumOperands());
- for (unsigned OpIdx = 0; OpIdx < I.getNumOperands(); ++OpIdx) {
+ for (unsigned OpIdx = 0, E =UniformArgs.size(); OpIdx != E; ++OpIdx) {
UniformArgs[OpIdx] = !isDivergentUse(I.getOperandUse(OpIdx));
}
>From f81169c80b9e160b6f83a3021bc94bc14d91712c Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 16 Dec 2025 21:10:54 +0530
Subject: [PATCH 13/17] [Review] address changes
---
llvm/include/llvm/ADT/GenericUniformityImpl.h | 23 +++++++++----------
llvm/lib/Analysis/UniformityAnalysis.cpp | 5 ++--
.../lib/CodeGen/MachineUniformityAnalysis.cpp | 20 +++++++---------
3 files changed, 21 insertions(+), 27 deletions(-)
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index c76964f9d7571..67d8bf09ecaf6 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -1,4 +1,4 @@
-//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===//
+//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -412,8 +412,8 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
/// based on its operands. This queries the target-specific callback.
bool isCustomUniform(const InstructionT &I) const;
- /// \brief keep track of instructions that require custom uniformity analysis.
- void addUniformInstruction(const InstructionT *I, InstructionUniformity IU);
+ /// \brief Add an instruction that requires custom uniformity analysis.
+ void addCustomUniformCandidate(const InstructionT *I);
protected:
const ContextT &Context;
@@ -428,9 +428,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
// Internal worklist for divergence propagation.
std::vector<const InstructionT *> Worklist;
- // Map containing tracked instruction that can be proven uniform based on its
- // operand Uniformity.
- DenseMap<const InstructionT *, InstructionUniformity> UniformInstruction;
+ // Set of instructions that require custom uniformity analysis based on
+ // operand uniformity.
+ SmallPtrSet<const InstructionT *, 8> CustomUniformCandidates;
/// \brief Mark \p Term as divergent and push all Instructions that become
/// divergent as a result on the worklist.
@@ -796,9 +796,8 @@ void GenericUniformityAnalysisImpl<ContextT>::markDivergent(
if (isAlwaysUniform(I))
return;
// Check if instruction requires custom uniformity analysis
- auto It = UniformInstruction.find(&I);
- if (It != UniformInstruction.end()) {
- if (It->second == InstructionUniformity::Custom && isCustomUniform(I)) {
+ if (CustomUniformCandidates.count(&I)) {
+ if (isCustomUniform(I)) {
addUniformOverride(I);
return;
}
@@ -835,9 +834,9 @@ void GenericUniformityAnalysisImpl<ContextT>::addUniformOverride(
}
template <typename ContextT>
-void GenericUniformityAnalysisImpl<ContextT>::addUniformInstruction(
- const InstructionT *I, InstructionUniformity IU) {
- UniformInstruction[I] = IU;
+void GenericUniformityAnalysisImpl<ContextT>::addCustomUniformCandidate(
+ const InstructionT *I) {
+ CustomUniformCandidates.insert(I);
}
// Mark as divergent all external uses of values defined in \p DefCycle.
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 7866aec824dbf..973899c7a161c 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -43,7 +43,7 @@ template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
break;
case InstructionUniformity::Custom:
// Instructions requiring custom uniformity analysis based on operands
- addUniformInstruction(&I, IU);
+ addCustomUniformCandidate(&I);
break;
case InstructionUniformity::Default:
break;
@@ -119,10 +119,9 @@ bool GenericUniformityAnalysisImpl<SSAContext>::isCustomUniform(
const Instruction &I) const {
// Build bitvector of uniform operands
SmallBitVector UniformArgs(I.getNumOperands());
- for (unsigned OpIdx = 0, E =UniformArgs.size(); OpIdx != E; ++OpIdx) {
+ for (unsigned OpIdx = 0, E = UniformArgs.size(); OpIdx != E; ++OpIdx) {
UniformArgs[OpIdx] = !isDivergentUse(I.getOperandUse(OpIdx));
}
-
// Query target-specific uniformity callback
return TTI->isUniform(&I, UniformArgs);
}
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 4e43287dc9424..5ee1e5ca8f46c 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -64,7 +64,7 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
break;
case InstructionUniformity::Custom:
// Instructions requiring custom uniformity analysis based on operands
- addUniformInstruction(&instr, uniformity);
+ addCustomUniformCandidate(&instr);
break;
case InstructionUniformity::Default:
break;
@@ -161,17 +161,13 @@ bool GenericUniformityAnalysisImpl<MachineSSAContext>::isCustomUniform(
const MachineInstr &MI) const {
const auto &InstrInfo = *F.getSubtarget().getInstrInfo();
- // Build bitvector of uniform register use operands
- SmallVector<const MachineOperand *, 4> RegUseOps;
- for (const MachineOperand &MO : MI.uses()) {
- if (MO.isReg() && MO.getReg().isVirtual()) {
- RegUseOps.push_back(&MO);
- }
- }
-
- SmallBitVector UniformArgs(RegUseOps.size());
- for (unsigned i = 0; i < RegUseOps.size(); ++i) {
- UniformArgs[i] = !isDivergentUse(*RegUseOps[i]);
+ // Build bitvector of uniform operands
+ SmallBitVector UniformArgs(MI.getNumOperands());
+ for (unsigned OpIdx = 0, E = MI.getNumOperands(); OpIdx != E; ++OpIdx) {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ // Register operands: check if divergent
+ // Non-register operands (immediates, etc.): always uniform
+ UniformArgs[OpIdx] = !MO.isReg() || !isDivergentUse(MO);
}
// Query target-specific uniformity callback
>From f29a21e9b1b24d788278619b491dc142bf24e557 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 17 Dec 2025 13:38:55 +0530
Subject: [PATCH 14/17] Change the analysis approach from proving uniformity to
 proving divergence
---
llvm/include/llvm/ADT/GenericUniformityImpl.h | 31 ++++++++++---------
.../llvm/Analysis/TargetTransformInfo.h | 17 +++++-----
.../llvm/Analysis/TargetTransformInfoImpl.h | 10 +++---
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 21 ++++++-------
llvm/lib/Analysis/TargetTransformInfo.cpp | 6 ++--
llvm/lib/Analysis/UniformityAnalysis.cpp | 18 +++++------
.../lib/CodeGen/MachineUniformityAnalysis.cpp | 19 ++++++------
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 14 ++++-----
.../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 4 +--
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 16 +++++-----
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 4 +--
11 files changed, 79 insertions(+), 81 deletions(-)
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 67d8bf09ecaf6..beeb5ad86608e 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -408,12 +408,12 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
void recordTemporalDivergence(ConstValueRefT, const InstructionT *,
const CycleT *);
- /// Check if an instruction with Custom uniformity can be proven uniform
+ /// Check if an instruction with Custom uniformity can be proven divergent
/// based on its operands. This queries the target-specific callback.
- bool isCustomUniform(const InstructionT &I) const;
+ bool isCustomDivergent(const InstructionT &I) const;
- /// \brief Add an instruction that requires custom uniformity analysis.
- void addCustomUniformCandidate(const InstructionT *I);
+ /// \brief Add an instruction that requires custom divergence analysis.
+ void addCustomDivergenceCandidate(const InstructionT *I);
protected:
const ContextT &Context;
@@ -428,9 +428,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
// Internal worklist for divergence propagation.
std::vector<const InstructionT *> Worklist;
- // Set of instructions that require custom uniformity analysis based on
- // operand uniformity.
- SmallPtrSet<const InstructionT *, 8> CustomUniformCandidates;
+ // Set of instructions that require custom divergence analysis based on
+ // operand divergence.
+ SmallPtrSet<const InstructionT *, 8> CustomDivergenceCandidates;
/// \brief Mark \p Term as divergent and push all Instructions that become
/// divergent as a result on the worklist.
@@ -795,12 +795,13 @@ void GenericUniformityAnalysisImpl<ContextT>::markDivergent(
const InstructionT &I) {
if (isAlwaysUniform(I))
return;
- // Check if instruction requires custom uniformity analysis
- if (CustomUniformCandidates.count(&I)) {
- if (isCustomUniform(I)) {
- addUniformOverride(I);
- return;
- }
+ // For custom divergence candidates, try to prove divergence.
+ // If we can't prove it's divergent yet, skip marking it.
+ // The candidate will be re-evaluated as operands become divergent.
+ if (CustomDivergenceCandidates.count(&I)) {
+ if (!isCustomDivergent(I))
+ return; // Can't prove divergent yet, assume uniform
+ // Otherwise, we can prove it's divergent, continue to mark it
}
bool Marked = false;
if (I.isTerminator()) {
@@ -834,9 +835,9 @@ void GenericUniformityAnalysisImpl<ContextT>::addUniformOverride(
}
template <typename ContextT>
-void GenericUniformityAnalysisImpl<ContextT>::addCustomUniformCandidate(
+void GenericUniformityAnalysisImpl<ContextT>::addCustomDivergenceCandidate(
const InstructionT *I) {
- CustomUniformCandidates.insert(I);
+ CustomDivergenceCandidates.insert(I);
}
// Mark as divergent all external uses of values defined in \p DefCycle.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index b6dc19a873ec4..6f50ffd048df7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -2084,17 +2084,16 @@ class TargetTransformInfo {
/// target.
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const;
- /// Determine if an instruction with some operands uniform can be proven
- /// uniform. This is used for custom uniformity analysis where the target
- /// can define complex rules that depend on which specific operands are
- /// uniform.
+ /// Determine if an instruction can be proven divergent based on which
+ /// operands are divergent.
///
/// \param I The instruction to check.
- /// \param UniformArgs A bitvector indicating which operands are known to be
- /// uniform (bit N corresponds to operand N).
- /// \returns true if the instruction result can be proven uniform given the
- /// uniform operands, false otherwise.
- bool isUniform(const Instruction *I, const SmallBitVector &UniformArgs) const;
+ /// \param DivergentArgs A bitvector indicating which operands are known to be
+ /// divergent (bit N corresponds to operand N).
+ /// \returns true if the instruction result can be proven divergent given the
+ /// divergent operands, false otherwise.
+ LLVM_ABI bool isDivergent(const Instruction *I,
+ const SmallBitVector &DivergentArgs) const;
private:
std::unique_ptr<const TargetTransformInfoImplBase> TTIImpl;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 246c05709c4d8..3bf6cb854a911 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1229,11 +1229,11 @@ class TargetTransformInfoImplBase {
virtual bool allowVectorElementIndexingUsingGEP() const { return true; }
// Custom uniformity check for instructions marked as Custom
- // Override this to provide complex uniformity rules based on which operands
- // are uniform
- virtual bool isUniform(const Instruction *I,
- const SmallBitVector &UniformArgs) const {
- return false; // Conservative: assume divergent
+ // Override this to provide complex divergence rules based on which operands
+ // are divergent
+ virtual bool isDivergent(const Instruction *I,
+ const SmallBitVector &DivergentArgs) const {
+ return false; // Conservative: can't prove divergent
}
protected:
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 67c3ac5f4c40e..5e0b7a242e20e 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2367,20 +2367,17 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
return InstructionUniformity::Default;
}
- /// Determine if a machine instruction with some operands uniform can be
- /// proven uniform. This is used for custom uniformity analysis where the
- /// target can define complex rules that depend on which specific operands
- /// are uniform.
+ /// Determine if a machine instruction can be proven divergent based on which
+ /// operands are divergent.
///
/// \param MI The machine instruction to check.
- /// \param UniformArgs A bitvector indicating which register operands are
- /// known to be uniform (bit N corresponds to the Nth
- /// register use operand).
- /// \returns true if the instruction result can be proven uniform given the
- /// uniform operands, false otherwise.
- virtual bool isUniform(const MachineInstr &MI,
- const SmallBitVector &UniformArgs) const {
- return false; // Conservative: assume divergent
+ /// \param DivergentArgs A bitvector indicating which operands are known to be
+ /// divergent (bit N corresponds to operand N).
+ /// \returns true if the instruction result can be proven divergent given the
+ /// divergent operands, false otherwise.
+ virtual bool isDivergent(const MachineInstr &MI,
+ const SmallBitVector &DivergentArgs) const {
+ return false; // Conservative: can't prove divergent
}
/// Returns true if the given \p MI defines a TargetIndex operand that can be
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 9345595555c2d..56152c9d8f429 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1551,9 +1551,9 @@ bool TargetTransformInfo::allowVectorElementIndexingUsingGEP() const {
return TTIImpl->allowVectorElementIndexingUsingGEP();
}
-bool TargetTransformInfo::isUniform(const Instruction *I,
- const SmallBitVector &UniformArgs) const {
- return TTIImpl->isUniform(I, UniformArgs);
+bool TargetTransformInfo::isDivergent(
+ const Instruction *I, const SmallBitVector &DivergentArgs) const {
+ return TTIImpl->isDivergent(I, DivergentArgs);
}
TargetTransformInfoImplBase::~TargetTransformInfoImplBase() = default;
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 973899c7a161c..551ec7f4a917a 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -42,8 +42,8 @@ template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
markDivergent(I);
break;
case InstructionUniformity::Custom:
- // Instructions requiring custom uniformity analysis based on operands
- addCustomUniformCandidate(&I);
+ // Instructions requiring custom divergence analysis based on operands
+ addCustomDivergenceCandidate(&I);
break;
case InstructionUniformity::Default:
break;
@@ -115,15 +115,15 @@ bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
}
template <>
-bool GenericUniformityAnalysisImpl<SSAContext>::isCustomUniform(
+bool GenericUniformityAnalysisImpl<SSAContext>::isCustomDivergent(
const Instruction &I) const {
- // Build bitvector of uniform operands
- SmallBitVector UniformArgs(I.getNumOperands());
- for (unsigned OpIdx = 0, E = UniformArgs.size(); OpIdx != E; ++OpIdx) {
- UniformArgs[OpIdx] = !isDivergentUse(I.getOperandUse(OpIdx));
+ // Build bitvector of divergent operands
+ SmallBitVector DivergentArgs(I.getNumOperands());
+ for (unsigned OpIdx = 0, E = DivergentArgs.size(); OpIdx != E; ++OpIdx) {
+ DivergentArgs[OpIdx] = isDivergentUse(I.getOperandUse(OpIdx));
}
- // Query target-specific uniformity callback
- return TTI->isUniform(&I, UniformArgs);
+ // Query target-specific divergence callback
+ return TTI->isDivergent(&I, DivergentArgs);
}
// This ensures explicit instantiation of
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 5ee1e5ca8f46c..2e932f4edad9b 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -63,8 +63,8 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
markDivergent(instr);
break;
case InstructionUniformity::Custom:
- // Instructions requiring custom uniformity analysis based on operands
- addCustomUniformCandidate(&instr);
+ // Instructions requiring custom divergence analysis based on operands
+ addCustomDivergenceCandidate(&instr);
break;
case InstructionUniformity::Default:
break;
@@ -157,21 +157,22 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
}
template <>
-bool GenericUniformityAnalysisImpl<MachineSSAContext>::isCustomUniform(
+bool GenericUniformityAnalysisImpl<MachineSSAContext>::isCustomDivergent(
const MachineInstr &MI) const {
const auto &InstrInfo = *F.getSubtarget().getInstrInfo();
- // Build bitvector of uniform operands
- SmallBitVector UniformArgs(MI.getNumOperands());
+ // Build bitvector of divergent operands
+ SmallBitVector DivergentArgs(MI.getNumOperands());
for (unsigned OpIdx = 0, E = MI.getNumOperands(); OpIdx != E; ++OpIdx) {
const MachineOperand &MO = MI.getOperand(OpIdx);
// Register operands: check if divergent
- // Non-register operands (immediates, etc.): always uniform
- UniformArgs[OpIdx] = !MO.isReg() || !isDivergentUse(MO);
+ // Non-register operands (immediates, etc.): always uniform (never
+ // divergent)
+ DivergentArgs[OpIdx] = MO.isReg() && isDivergentUse(MO);
}
- // Query target-specific uniformity callback
- return InstrInfo.isUniform(MI, UniformArgs);
+ // Query target-specific divergence callback
+ return InstrInfo.isDivergent(MI, DivergentArgs);
}
// This ensures explicit instantiation of
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 79842504b19ee..2c4dad60a249b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1749,6 +1749,7 @@ GCNTTIImpl::getInstructionUniformity(const Value *V) const {
return InstructionUniformity::Default;
}
+<<<<<<< HEAD
InstructionCost GCNTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
StackOffset BaseOffset,
bool HasBaseReg, int64_t Scale,
@@ -1796,21 +1797,20 @@ bool GCNTTIImpl::shouldDropLSRSolutionIfLessProfitable() const {
return true;
}
-bool GCNTTIImpl::isUniform(const Instruction *I,
- const SmallBitVector &UniformArgs) const {
- // Custom uniformity check for permlane16/permlanex16
+bool GCNTTIImpl::isDivergent(const Instruction *I,
+ const SmallBitVector &DivergentArgs) const {
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(I)) {
switch (Intrinsic->getIntrinsicID()) {
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
// For permlane16/permlanex16:
- // Operand 0: old value (ignored for uniformity)
+ // Operand 0: old value (ignored for divergence)
// Operand 1: src0 (source value to permute)
// Operand 2: src1 (lane select within 16-lane group)
// Operand 3: src2 (which 16-lane group)
- // Result is uniform if either src0 (op 1) or src1 (op 2) is uniform
- if (UniformArgs.size() > 2) {
- return UniformArgs[1] || UniformArgs[2];
+ // Result is divergent if both src0 (op 1) and src1 (op 2) are divergent
+ if (DivergentArgs.size() > 2) {
+ return DivergentArgs[1] && DivergentArgs[2];
}
return false;
default:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index ea2bf72836199..32aec671383f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -322,8 +322,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
bool isNumRegsMajorCostOfLSR() const override;
bool shouldDropLSRSolutionIfLessProfitable() const override;
- bool isUniform(const Instruction *I,
- const SmallBitVector &UniformArgs) const override;
+ bool isDivergent(const Instruction *I,
+ const SmallBitVector &DivergentArgs) const override;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index b15d57d346669..fbe1b9c402188 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10887,18 +10887,18 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::Default;
}
-bool SIInstrInfo::isUniform(const MachineInstr &MI,
- const SmallBitVector &UniformArgs) const {
+bool SIInstrInfo::isDivergent(const MachineInstr &MI,
+ const SmallBitVector &DivergentArgs) const {
unsigned opcode = MI.getOpcode();
- // Custom uniformity check for permlane16/permlanex16
+ // Custom divergence check for permlane16/permlanex16
if (opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
opcode == AMDGPU::V_PERMLANEX16_B32_e64) {
- // Result is uniform if either src0 or src1 is uniform
- // UniformArgs[0] = src0 (source value)
- // UniformArgs[1] = src1 (lane select)
- if (UniformArgs.size() >= 2) {
- return UniformArgs[0] || UniformArgs[1];
+ // Result is divergent if both src0 and src1 are divergent
+ // DivergentArgs[0] = src0 (source value)
+ // DivergentArgs[1] = src1 (lane select)
+ if (DivergentArgs.size() >= 2) {
+ return DivergentArgs[0] && DivergentArgs[1];
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 429982f75f29d..3d09521e0794f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1680,8 +1680,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
InstructionUniformity
getInstructionUniformity(const MachineInstr &MI) const final;
- bool isUniform(const MachineInstr &MI,
- const SmallBitVector &UniformArgs) const final;
+ bool isDivergent(const MachineInstr &MI,
+ const SmallBitVector &DivergentArgs) const final;
InstructionUniformity
getGenericInstructionUniformity(const MachineInstr &MI) const;
>From 0a3b84a74b1f8053e3ab9baa44baba2fcc790cc1 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Fri, 20 Mar 2026 15:52:33 +0530
Subject: [PATCH 15/17] Address review suggestions
---
llvm/include/llvm/ADT/GenericUniformityImpl.h | 29 +++----
llvm/include/llvm/ADT/Uniformity.h | 6 +-
.../llvm/Analysis/TargetTransformInfoImpl.h | 5 +-
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 13 ---
llvm/lib/Analysis/UniformityAnalysis.cpp | 9 +-
.../lib/CodeGen/MachineUniformityAnalysis.cpp | 20 +----
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 29 ++-----
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 24 ------
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 3 -
.../AMDGPU/MIR/uniform-permlane.mir | 86 -------------------
.../UniformityAnalysis/AMDGPU/intrinsics.ll | 4 +-
.../AMDGPU/uniform_intrinsic.ll | 71 +++++++--------
12 files changed, 62 insertions(+), 237 deletions(-)
delete mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index beeb5ad86608e..44f4db3cccb4c 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -408,12 +408,12 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
void recordTemporalDivergence(ConstValueRefT, const InstructionT *,
const CycleT *);
- /// Check if an instruction with Custom uniformity can be proven divergent
+ /// Check if an instruction with Custom uniformity can be proven uniform
/// based on its operands. This queries the target-specific callback.
- bool isCustomDivergent(const InstructionT &I) const;
+ bool isCustomUniform(const InstructionT &I) const;
- /// \brief Add an instruction that requires custom divergence analysis.
- void addCustomDivergenceCandidate(const InstructionT *I);
+ /// \brief Add an instruction that requires custom uniformity analysis.
+ void addCustomUniformityCandidate(const InstructionT *I);
protected:
const ContextT &Context;
@@ -428,9 +428,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
// Internal worklist for divergence propagation.
std::vector<const InstructionT *> Worklist;
- // Set of instructions that require custom divergence analysis based on
- // operand divergence.
- SmallPtrSet<const InstructionT *, 8> CustomDivergenceCandidates;
+ // Set of instructions that require custom uniformity analysis based on
+ // operand uniformity.
+ SmallPtrSet<const InstructionT *, 8> CustomUniformityCandidates;
/// \brief Mark \p Term as divergent and push all Instructions that become
/// divergent as a result on the worklist.
@@ -795,13 +795,12 @@ void GenericUniformityAnalysisImpl<ContextT>::markDivergent(
const InstructionT &I) {
if (isAlwaysUniform(I))
return;
- // For custom divergence candidates, try to prove divergence.
- // If we can't prove it's divergent yet, skip marking it.
+ // For custom uniformity candidates, check if the instruction can be
+ // proven uniform based on which operands are uniform/divergent.
// The candidate will be re-evaluated as operands become divergent.
- if (CustomDivergenceCandidates.count(&I)) {
- if (!isCustomDivergent(I))
- return; // Can't prove divergent yet, assume uniform
- // Otherwise, we can prove it's divergent, continue to mark it
+ if (CustomUniformityCandidates.count(&I)) {
+ if (isCustomUniform(I))
+ return;
}
bool Marked = false;
if (I.isTerminator()) {
@@ -835,9 +834,9 @@ void GenericUniformityAnalysisImpl<ContextT>::addUniformOverride(
}
template <typename ContextT>
-void GenericUniformityAnalysisImpl<ContextT>::addCustomDivergenceCandidate(
+void GenericUniformityAnalysisImpl<ContextT>::addCustomUniformityCandidate(
const InstructionT *I) {
- CustomDivergenceCandidates.insert(I);
+ CustomUniformityCandidates.insert(I);
}
// Mark as divergent all external uses of values defined in \p DefCycle.
diff --git a/llvm/include/llvm/ADT/Uniformity.h b/llvm/include/llvm/ADT/Uniformity.h
index 43e588745f73f..0adbdf99c6d2a 100644
--- a/llvm/include/llvm/ADT/Uniformity.h
+++ b/llvm/include/llvm/ADT/Uniformity.h
@@ -25,9 +25,9 @@ enum class InstructionUniformity {
/// The result values can never be assumed to be uniform.
NeverUniform,
- /// If all operands are uniform, the result values are uniform. Otherwise,
- /// the result values may be divergent, and a custom check may be used to
- /// determine uniformity via a callback.
+ /// The result values require a custom uniformity check. A target-specific
+ /// callback determines whether the result is uniform based on which
+ /// operands are uniform.
Custom
};
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 3bf6cb854a911..a2cf6f133f09c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1228,12 +1228,9 @@ class TargetTransformInfoImplBase {
virtual bool allowVectorElementIndexingUsingGEP() const { return true; }
- // Custom uniformity check for instructions marked as Custom
- // Override this to provide complex divergence rules based on which operands
- // are divergent
virtual bool isDivergent(const Instruction *I,
const SmallBitVector &DivergentArgs) const {
- return false; // Conservative: can't prove divergent
+ llvm_unreachable("target must implement isDivergent for Custom uniformity");
}
protected:
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 5e0b7a242e20e..77f710203d1fc 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2367,19 +2367,6 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
return InstructionUniformity::Default;
}
- /// Determine if a machine instruction can be proven divergent based on which
- /// operands are divergent.
- ///
- /// \param MI The machine instruction to check.
- /// \param DivergentArgs A bitvector indicating which operands are known to be
- /// divergent (bit N corresponds to operand N).
- /// \returns true if the instruction result can be proven divergent given the
- /// divergent operands, false otherwise.
- virtual bool isDivergent(const MachineInstr &MI,
- const SmallBitVector &DivergentArgs) const {
- return false; // Conservative: can't prove divergent
- }
-
/// Returns true if the given \p MI defines a TargetIndex operand that can be
/// tracked by their offset, can have values, and can have debug info
/// associated with it. If so, sets \p Index and \p Offset of the target index
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 551ec7f4a917a..c0d301a99b86e 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -42,8 +42,7 @@ template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
markDivergent(I);
break;
case InstructionUniformity::Custom:
- // Instructions requiring custom divergence analysis based on operands
- addCustomDivergenceCandidate(&I);
+ addCustomUniformityCandidate(&I);
break;
case InstructionUniformity::Default:
break;
@@ -115,15 +114,13 @@ bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
}
template <>
-bool GenericUniformityAnalysisImpl<SSAContext>::isCustomDivergent(
+bool GenericUniformityAnalysisImpl<SSAContext>::isCustomUniform(
const Instruction &I) const {
- // Build bitvector of divergent operands
SmallBitVector DivergentArgs(I.getNumOperands());
for (unsigned OpIdx = 0, E = DivergentArgs.size(); OpIdx != E; ++OpIdx) {
DivergentArgs[OpIdx] = isDivergentUse(I.getOperandUse(OpIdx));
}
- // Query target-specific divergence callback
- return TTI->isDivergent(&I, DivergentArgs);
+ return !TTI->isDivergent(&I, DivergentArgs);
}
// This ensures explicit instantiation of
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 2e932f4edad9b..af1c448497a52 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -8,7 +8,6 @@
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/ADT/GenericUniformityImpl.h"
-#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/MachineCycleAnalysis.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -63,8 +62,6 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
markDivergent(instr);
break;
case InstructionUniformity::Custom:
- // Instructions requiring custom divergence analysis based on operands
- addCustomDivergenceCandidate(&instr);
break;
case InstructionUniformity::Default:
break;
@@ -157,22 +154,9 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
}
template <>
-bool GenericUniformityAnalysisImpl<MachineSSAContext>::isCustomDivergent(
+bool GenericUniformityAnalysisImpl<MachineSSAContext>::isCustomUniform(
const MachineInstr &MI) const {
- const auto &InstrInfo = *F.getSubtarget().getInstrInfo();
-
- // Build bitvector of divergent operands
- SmallBitVector DivergentArgs(MI.getNumOperands());
- for (unsigned OpIdx = 0, E = MI.getNumOperands(); OpIdx != E; ++OpIdx) {
- const MachineOperand &MO = MI.getOperand(OpIdx);
- // Register operands: check if divergent
- // Non-register operands (immediates, etc.): always uniform (never
- // divergent)
- DivergentArgs[OpIdx] = MO.isReg() && isDivergentUse(MO);
- }
-
- // Query target-specific divergence callback
- return InstrInfo.isDivergent(MI, DivergentArgs);
+ llvm_unreachable("no MIR instructions use Custom uniformity yet");
}
// This ensures explicit instantiation of
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 2c4dad60a249b..11f25e6aff5da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1732,8 +1732,7 @@ InstructionUniformity
GCNTTIImpl::getInstructionUniformity(const Value *V) const {
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
switch (Intrinsic->getIntrinsicID()) {
- case Intrinsic::amdgcn_permlane16:
- case Intrinsic::amdgcn_permlanex16:
+ case Intrinsic::amdgcn_wave_shuffle:
return InstructionUniformity::Custom;
default:
break;
@@ -1749,7 +1748,6 @@ GCNTTIImpl::getInstructionUniformity(const Value *V) const {
return InstructionUniformity::Default;
}
-<<<<<<< HEAD
InstructionCost GCNTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
StackOffset BaseOffset,
bool HasBaseReg, int64_t Scale,
@@ -1799,23 +1797,12 @@ bool GCNTTIImpl::shouldDropLSRSolutionIfLessProfitable() const {
bool GCNTTIImpl::isDivergent(const Instruction *I,
const SmallBitVector &DivergentArgs) const {
- if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(I)) {
- switch (Intrinsic->getIntrinsicID()) {
- case Intrinsic::amdgcn_permlane16:
- case Intrinsic::amdgcn_permlanex16:
- // For permlane16/permlanex16:
- // Operand 0: old value (ignored for divergence)
- // Operand 1: src0 (source value to permute)
- // Operand 2: src1 (lane select within 16-lane group)
- // Operand 3: src2 (which 16-lane group)
- // Result is divergent if both src0 (op 1) and src1 (op 2) are divergent
- if (DivergentArgs.size() > 2) {
- return DivergentArgs[1] && DivergentArgs[2];
- }
- return false;
- default:
- break;
- }
+ const IntrinsicInst *Intrinsic = cast<IntrinsicInst>(I);
+ switch (Intrinsic->getIntrinsicID()) {
+ case Intrinsic::amdgcn_wave_shuffle:
+ // wave_shuffle(Value, Index): result is divergent iff Index is divergent.
+ return DivergentArgs[1];
+ default:
+ llvm_unreachable("unexpected intrinsic in isDivergent");
}
- return false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index fbe1b9c402188..5086c553da101 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -20,7 +20,6 @@
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -10804,12 +10803,6 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::NeverUniform;
unsigned opcode = MI.getOpcode();
-
- // permlane16/permlanex16 require custom uniformity analysis
- if (opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
- opcode == AMDGPU::V_PERMLANEX16_B32_e64)
- return InstructionUniformity::Custom;
-
if (opcode == AMDGPU::V_READLANE_B32 ||
opcode == AMDGPU::V_READFIRSTLANE_B32 ||
opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
@@ -10887,23 +10880,6 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::Default;
}
-bool SIInstrInfo::isDivergent(const MachineInstr &MI,
- const SmallBitVector &DivergentArgs) const {
- unsigned opcode = MI.getOpcode();
-
- // Custom divergence check for permlane16/permlanex16
- if (opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
- opcode == AMDGPU::V_PERMLANEX16_B32_e64) {
- // Result is divergent if both src0 and src1 are divergent
- // DivergentArgs[0] = src0 (source value)
- // DivergentArgs[1] = src1 (lane select)
- if (DivergentArgs.size() >= 2) {
- return DivergentArgs[0] && DivergentArgs[1];
- }
- }
-
- return false;
-}
unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
switch (MF.getFunction().getCallingConv()) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 3d09521e0794f..93d28d22bfd16 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1680,9 +1680,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
InstructionUniformity
getInstructionUniformity(const MachineInstr &MI) const final;
- bool isDivergent(const MachineInstr &MI,
- const SmallBitVector &DivergentArgs) const final;
-
InstructionUniformity
getGenericInstructionUniformity(const MachineInstr &MI) const;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
deleted file mode 100644
index da6048d86b2dd..0000000000000
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
+++ /dev/null
@@ -1,86 +0,0 @@
-# RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
-
-# Test the machine-level uniformity analysis for permlane16/permlanex16 instructions.
-#
-# NOTE: Permlane instructions have a hardware constraint that src1 (lane select) and src2
-# must be SGPR (scalar) registers. Since SGPRs are always uniform at machine level,
-# permlane results are always uniform according to the AnyOfFirstTwoUseOp logic
-# (either src0 OR src1 being uniform makes the result uniform, and src1 is always uniform).
-#
-# These tests verify that the uniformity analysis correctly handles permlane instructions
-# and that uniform results propagate through chains of operations.
-
----
-# Test: permlane16 with divergent VGPR src and uniform SGPR lane select
-# Result is UNIFORM because lane select (SGPR) is always uniform
-name: permlane16_basic
-machineFunctionInfo:
- isEntryFunction: true
-body: |
- bb.0:
- ; CHECK-LABEL: MachineUniformityInfo for function: @permlane16_basic
- ; CHECK: ALL VALUES UNIFORM
- %0:vgpr_32 = IMPLICIT_DEF
- %1:sreg_32 = S_MOV_B32 5
- %2:sreg_32 = IMPLICIT_DEF
- %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
- S_ENDPGM 0
-
-...
----
-# Test: permlanex16 with divergent VGPR src and uniform SGPR lane select
-# Result is UNIFORM because lane select (SGPR) is always uniform
-name: permlanex16_basic
-machineFunctionInfo:
- isEntryFunction: true
-body: |
- bb.0:
- ; CHECK-LABEL: MachineUniformityInfo for function: @permlanex16_basic
- ; CHECK: ALL VALUES UNIFORM
- %0:vgpr_32 = IMPLICIT_DEF
- %1:sreg_32 = S_MOV_B32 7
- %2:sreg_32 = IMPLICIT_DEF
- %3:vgpr_32 = V_PERMLANEX16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
- S_ENDPGM 0
-
-...
----
-# Test: Chain of permlane operations - uniformity propagates
-# Both permlanes are uniform, second uses result of first as source
-name: permlane16_chain_uniform
-machineFunctionInfo:
- isEntryFunction: true
-body: |
- bb.0:
- ; CHECK-LABEL: MachineUniformityInfo for function: @permlane16_chain_uniform
- ; CHECK: ALL VALUES UNIFORM
- %0:vgpr_32 = IMPLICIT_DEF
- %1:sreg_32 = S_MOV_B32 3
- %2:sreg_32 = IMPLICIT_DEF
- ; First permlane - uniform because lane select is SGPR
- %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
- ; Second permlane uses uniform result - also uniform
- %4:vgpr_32 = V_PERMLANEX16_B32_e64 0, %3, 0, %1, 0, %2, %3, 0, implicit $exec
- S_ENDPGM 0
-
-...
----
-# Test: Multiple permlane operations in sequence
-# Verifies that uniformity is correctly tracked through complex chains
-name: permlane_multiple
-machineFunctionInfo:
- isEntryFunction: true
-body: |
- bb.0:
- ; CHECK-LABEL: MachineUniformityInfo for function: @permlane_multiple
- ; CHECK: ALL VALUES UNIFORM
- %0:vgpr_32 = IMPLICIT_DEF
- %1:sreg_32 = S_MOV_B32 1
- %2:sreg_32 = S_MOV_B32 2
- %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
- %4:vgpr_32 = V_PERMLANEX16_B32_e64 0, %3, 0, %1, 0, %2, %3, 0, implicit $exec
- %5:vgpr_32 = V_PERMLANE16_B32_e64 0, %4, 0, %2, 0, %1, %4, 0, implicit $exec
- S_ENDPGM 0
-
-...
-
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 9b91c7bee84bd..46cb8cc1312dc 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -7,14 +7,14 @@ define amdgpu_kernel void @ds_swizzle(ptr addrspace(1) %out, i32 %src) #0 {
ret void
}
-; CHECK: ALL VALUES UNIFORM
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
define amdgpu_kernel void @v_permlane16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
%v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
store i32 %v, ptr addrspace(1) %out
ret void
}
-; CHECK: ALL VALUES UNIFORM
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
%v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
store i32 %v, ptr addrspace(1) %out
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
index e7391ee0c265b..d7a697e85f376 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
@@ -1,59 +1,46 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
-; CHECK: ALL VALUES UNIFORM
-define amdgpu_kernel void @v_permlane16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
- %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
- store i32 %v, ptr addrspace(1) %out
- ret void
-}
+; wave_shuffle(Value, Index): result is uniform when Index is uniform,
+; regardless of Value's divergence.
+; All kernel args are uniform, so Index is uniform => result is uniform.
+; CHECK-LABEL: UniformityInfo for function 'wave_shuffle_all_uniform':
; CHECK: ALL VALUES UNIFORM
-define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
- %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+define amdgpu_kernel void @wave_shuffle_all_uniform(ptr addrspace(1) %out, i32 %val, i32 %idx) {
+ %v = call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %idx)
store i32 %v, ptr addrspace(1) %out
ret void
}
-; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK: %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
-define amdgpu_kernel void @div_permlane16_var_uni_usr_x16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
- %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
- %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false) #0
- store i32 %v1, ptr addrspace(1) %out
- ret void
-}
-
-; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK: %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
-define amdgpu_kernel void @div_permlane16_var_uni_x16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
- %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
- %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
- store i32 %v1, ptr addrspace(1) %out
+; Value is divergent (thread ID), but Index is uniform => result is uniform.
+; CHECK-LABEL: UniformityInfo for function 'wave_shuffle_divergent_val_uniform_idx':
+; CHECK-NOT: DIVERGENT: {{.*}}wave.shuffle
+define amdgpu_kernel void @wave_shuffle_divergent_val_uniform_idx(ptr addrspace(1) %out, i32 %idx) {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.wave.shuffle(i32 %tid, i32 %idx)
+ store i32 %v, ptr addrspace(1) %out
ret void
}
-; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK: %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
-define amdgpu_kernel void @div_permlane16_var_uni_usr_16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
- %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
- %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false) #0
- store i32 %v1, ptr addrspace(1) %out
+; Value is uniform, but Index is divergent (thread ID) => result is divergent.
+; CHECK-LABEL: UniformityInfo for function 'wave_shuffle_uniform_val_divergent_idx':
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.wave.shuffle.i32(i32 %val, i32 %tid)
+define amdgpu_kernel void @wave_shuffle_uniform_val_divergent_idx(ptr addrspace(1) %out, i32 %val) {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %tid)
+ store i32 %v, ptr addrspace(1) %out
ret void
}
-; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK: %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
-define amdgpu_kernel void @div_permlane16_var_uni_16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
- %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
- %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
- store i32 %v1, ptr addrspace(1) %out
+; Both Value and Index are divergent => result is divergent.
+; CHECK-LABEL: UniformityInfo for function 'wave_shuffle_both_divergent':
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.wave.shuffle.i32(i32 %tid, i32 %tid)
+define amdgpu_kernel void @wave_shuffle_both_divergent(ptr addrspace(1) %out) {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.wave.shuffle(i32 %tid, i32 %tid)
+ store i32 %v, ptr addrspace(1) %out
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
+declare i32 @llvm.amdgcn.wave.shuffle(i32, i32)
+declare i32 @llvm.amdgcn.workitem.id.x()
>From 06b7234eb0d24410a3dba55728a79ab81a40277c Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 24 Mar 2026 14:35:46 +0530
Subject: [PATCH 16/17] Update the divergence logic for wave_shuffle
---
.../Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 6 ++++--
.../AMDGPU/uniform_intrinsic.ll | 16 ++++++++++------
2 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 11f25e6aff5da..72ad418d3cf17 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1800,8 +1800,10 @@ bool GCNTTIImpl::isDivergent(const Instruction *I,
const IntrinsicInst *Intrinsic = cast<IntrinsicInst>(I);
switch (Intrinsic->getIntrinsicID()) {
case Intrinsic::amdgcn_wave_shuffle:
- // wave_shuffle(Value, Index): result is divergent iff Index is divergent.
- return DivergentArgs[1];
+ // wave_shuffle(Value, Index): result is divergent only when both Value and
+ // Index are divergent. A uniform Value read from any lane yields the same
+ // result, and a uniform Index makes all lanes read the same source lane.
+ return DivergentArgs[0] && DivergentArgs[1];
default:
llvm_unreachable("unexpected intrinsic in isDivergent");
}
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
index d7a697e85f376..e3367235c9f6d 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
@@ -1,9 +1,10 @@
; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
-; wave_shuffle(Value, Index): result is uniform when Index is uniform,
-; regardless of Value's divergence.
+; wave_shuffle(Value, Index): result is divergent only when both Value and
+; Index are divergent. A uniform Value read from any lane yields the same
+; result, and a uniform Index makes all lanes read the same source lane.
-; All kernel args are uniform, so Index is uniform => result is uniform.
+; All kernel args are uniform => result is uniform.
; CHECK-LABEL: UniformityInfo for function 'wave_shuffle_all_uniform':
; CHECK: ALL VALUES UNIFORM
define amdgpu_kernel void @wave_shuffle_all_uniform(ptr addrspace(1) %out, i32 %val, i32 %idx) {
@@ -12,7 +13,8 @@ define amdgpu_kernel void @wave_shuffle_all_uniform(ptr addrspace(1) %out, i32 %
ret void
}
-; Value is divergent (thread ID), but Index is uniform => result is uniform.
+; Value is divergent, Index is uniform => result is uniform.
+; All lanes read from the same source lane, so every lane produces the same
+; value and the result is uniform.
; CHECK-LABEL: UniformityInfo for function 'wave_shuffle_divergent_val_uniform_idx':
; CHECK-NOT: DIVERGENT: {{.*}}wave.shuffle
define amdgpu_kernel void @wave_shuffle_divergent_val_uniform_idx(ptr addrspace(1) %out, i32 %idx) {
@@ -22,9 +24,11 @@ define amdgpu_kernel void @wave_shuffle_divergent_val_uniform_idx(ptr addrspace(
ret void
}
-; Value is uniform, but Index is divergent (thread ID) => result is divergent.
+; Value is uniform, Index is divergent => result is uniform.
+; Each lane may read from a different source lane, but Value is the same
+; across all lanes so the result is still uniform.
; CHECK-LABEL: UniformityInfo for function 'wave_shuffle_uniform_val_divergent_idx':
-; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.wave.shuffle.i32(i32 %val, i32 %tid)
+; CHECK-NOT: DIVERGENT: {{.*}}wave.shuffle
define amdgpu_kernel void @wave_shuffle_uniform_val_divergent_idx(ptr addrspace(1) %out, i32 %val) {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v = call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %tid)
>From 2c8c0c981478f704418d4d5b9bf059c13ccc511b Mon Sep 17 00:00:00 2001
From: Pankaj Dwivedi <divedi.pk.117 at gmail.com>
Date: Tue, 24 Mar 2026 16:32:50 +0530
Subject: [PATCH 17/17] Update llvm/include/llvm/ADT/GenericUniformityImpl.h
Co-authored-by: Jay Foad <jay.foad at gmail.com>
---
llvm/include/llvm/ADT/GenericUniformityImpl.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 44f4db3cccb4c..a306ed8cb6354 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -798,7 +798,7 @@ void GenericUniformityAnalysisImpl<ContextT>::markDivergent(
// For custom uniformity candidates, check if the instruction can be
// proven uniform based on which operands are uniform/divergent.
// The candidate will be re-evaluated as operands become divergent.
- if (CustomUniformityCandidates.count(&I)) {
+ if (CustomUniformityCandidates.contains(&I)) {
if (isCustomUniform(I))
return;
}
More information about the llvm-commits
mailing list