[llvm] [TTI] Introduce getInstructionUniformity API for flexible uniformity analysis (PR #137639)
Pankaj Dwivedi via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 20 03:00:48 PST 2025
https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/137639
>From 10936699fe3c83447ceea61c87fc6040c51d46be Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 23 Sep 2025 17:25:29 +0530
Subject: [PATCH 1/9] [NFC] Move isDivergentUse so that a later dependent
 function in pushUsers can safely use it
---
llvm/lib/Analysis/UniformityAnalysis.cpp | 26 ++++++++++++------------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 2e4063f5db14e..1702cb3b96b92 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -29,6 +29,19 @@ bool llvm::GenericUniformityAnalysisImpl<SSAContext>::markDefsDivergent(
return markDivergent(cast<Value>(&Instr));
}
+template <>
+bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
+ const Use &U) const {
+ const auto *V = U.get();
+ if (isDivergent(V))
+ return true;
+ if (const auto *DefInstr = dyn_cast<Instruction>(V)) {
+ const auto *UseInstr = cast<Instruction>(U.getUser());
+ return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
+ }
+ return false;
+}
+
template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
for (auto &I : instructions(F)) {
if (TTI->isSourceOfDivergence(&I))
@@ -88,19 +101,6 @@ void llvm::GenericUniformityAnalysisImpl<
}
}
-template <>
-bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
- const Use &U) const {
- const auto *V = U.get();
- if (isDivergent(V))
- return true;
- if (const auto *DefInstr = dyn_cast<Instruction>(V)) {
- const auto *UseInstr = cast<Instruction>(U.getUser());
- return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
- }
- return false;
-}
-
// This ensures explicit instantiation of
// GenericUniformityAnalysisImpl::ImplDeleter::operator()
template class llvm::GenericUniformityInfo<SSAContext>;
>From 2e4a108dd52d8e35e11da387a33b2de56b2c9d70 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Sat, 1 Nov 2025 02:02:14 +0530
Subject: [PATCH 2/9] add target hook to capture special operand uniformity and
update UA to use it
---
llvm/include/llvm/ADT/GenericUniformityImpl.h | 22 +++++++
llvm/include/llvm/ADT/Uniformity.h | 5 +-
.../llvm/Analysis/TargetTransformInfo.h | 2 +
.../llvm/Analysis/TargetTransformInfoImpl.h | 4 ++
llvm/lib/Analysis/TargetTransformInfo.cpp | 5 ++
llvm/lib/Analysis/UniformityAnalysis.cpp | 42 +++++++++----
.../lib/CodeGen/MachineUniformityAnalysis.cpp | 11 ++++
.../Target/AMDGPU/AMDGPUSearchableTables.td | 2 -
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 14 +++++
.../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 2 +
.../UniformityAnalysis/AMDGPU/intrinsics.ll | 4 +-
.../AMDGPU/uniform_intrinsic.ll | 59 +++++++++++++++++++
12 files changed, 154 insertions(+), 18 deletions(-)
create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 7fb0dbe22f12f..fdc714b5fa778 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -51,6 +51,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Uniformity.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "uniformity"
@@ -407,6 +408,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
void recordTemporalDivergence(ConstValueRefT, const InstructionT *,
const CycleT *);
+ bool isOperandUniform(const InstructionT &I, InstructionUniformity IU) const;
+
+ /// \brief Keep track of target instructions that can be proven uniform.
+ void addUniformInstruction(const InstructionT *I, InstructionUniformity IU);
+
protected:
const ContextT &Context;
const FunctionT &F;
@@ -420,6 +426,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
// Internal worklist for divergence propagation.
std::vector<const InstructionT *> Worklist;
+ // Map containing tracked instruction that can be proven uniform based on its
+ // operand Uniformity.
+ llvm::DenseMap<const InstructionT *, InstructionUniformity>
+ UniformInstruction;
+
/// \brief Mark \p Term as divergent and push all Instructions that become
/// divergent as a result on the worklist.
void analyzeControlDivergence(const InstructionT &Term);
@@ -785,6 +796,11 @@ void GenericUniformityAnalysisImpl<ContextT>::markDivergent(
const InstructionT &I) {
if (isAlwaysUniform(I))
return;
+ auto It = UniformInstruction.find(&I);
+ if (It != UniformInstruction.end() && isOperandUniform(I, It->second)) {
+ addUniformOverride(I);
+ return;
+ }
bool Marked = false;
if (I.isTerminator()) {
Marked = DivergentTermBlocks.insert(I.getParent()).second;
@@ -816,6 +832,12 @@ void GenericUniformityAnalysisImpl<ContextT>::addUniformOverride(
UniformOverrides.insert(&Instr);
}
+template <typename ContextT>
+void GenericUniformityAnalysisImpl<ContextT>::addUniformInstruction(
+ const InstructionT *I, InstructionUniformity IU) {
+ UniformInstruction[I] = IU;
+}
+
// Mark as divergent all external uses of values defined in \p DefCycle.
//
// A value V defined by a block B inside \p DefCycle may be used outside the
diff --git a/llvm/include/llvm/ADT/Uniformity.h b/llvm/include/llvm/ADT/Uniformity.h
index 21ca106b80be3..9571d43b8a9b9 100644
--- a/llvm/include/llvm/ADT/Uniformity.h
+++ b/llvm/include/llvm/ADT/Uniformity.h
@@ -23,7 +23,10 @@ enum class InstructionUniformity {
AlwaysUniform,
/// The result values can never be assumed to be uniform.
- NeverUniform
+ NeverUniform,
+
+ /// Result value can be uniform if either of first two operand are uniform.
+ EitherOfFirstTwoOp
};
} // namespace llvm
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index a65e4667ab76c..ad5ab5ddce40f 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -24,6 +24,7 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/ADT/Uniformity.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/InterestingMemoryOperand.h"
#include "llvm/IR/FMF.h"
@@ -1999,6 +2000,7 @@ class TargetTransformInfo {
/// Returns true if GEP should not be used to index into vectors for this
/// target.
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const;
+ InstructionUniformity getInstructionUniformity(const Instruction &I) const;
private:
std::unique_ptr<const TargetTransformInfoImplBase> TTIImpl;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index d8e35748f53e5..847dd46c01cfc 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1153,6 +1153,10 @@ class TargetTransformInfoImplBase {
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {}
virtual bool allowVectorElementIndexingUsingGEP() const { return true; }
+ virtual InstructionUniformity
+ getInstructionUniformity(const Instruction &I) const {
+ return InstructionUniformity::Default;
+ }
protected:
// Obtain the minimum required size to hold the value (without the sign)
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 45369f0ffe137..cebe4e207c66d 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1517,6 +1517,11 @@ bool TargetTransformInfo::allowVectorElementIndexingUsingGEP() const {
return TTIImpl->allowVectorElementIndexingUsingGEP();
}
+InstructionUniformity
+TargetTransformInfo::getInstructionUniformity(const Instruction &I) const {
+ return TTIImpl->getInstructionUniformity(I);
+}
+
TargetTransformInfoImplBase::~TargetTransformInfoImplBase() = default;
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 1702cb3b96b92..64aa9e42d673d 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -8,6 +8,7 @@
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/ADT/GenericUniformityImpl.h"
+#include "llvm/ADT/Uniformity.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
@@ -29,25 +30,15 @@ bool llvm::GenericUniformityAnalysisImpl<SSAContext>::markDefsDivergent(
return markDivergent(cast<Value>(&Instr));
}
-template <>
-bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
- const Use &U) const {
- const auto *V = U.get();
- if (isDivergent(V))
- return true;
- if (const auto *DefInstr = dyn_cast<Instruction>(V)) {
- const auto *UseInstr = cast<Instruction>(U.getUser());
- return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
- }
- return false;
-}
-
template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
for (auto &I : instructions(F)) {
if (TTI->isSourceOfDivergence(&I))
markDivergent(I);
else if (TTI->isAlwaysUniform(&I))
addUniformOverride(I);
+ InstructionUniformity IU = TTI->getInstructionUniformity(I);
+ if (IU != InstructionUniformity::Default)
+ addUniformInstruction(&I, IU);
}
for (auto &Arg : F.args()) {
if (TTI->isSourceOfDivergence(&Arg)) {
@@ -101,6 +92,31 @@ void llvm::GenericUniformityAnalysisImpl<
}
}
+template <>
+bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
+ const Use &U) const {
+ const auto *V = U.get();
+ if (isDivergent(V))
+ return true;
+ if (const auto *DefInstr = dyn_cast<Instruction>(V)) {
+ const auto *UseInstr = cast<Instruction>(U.getUser());
+ return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
+ }
+ return false;
+}
+
+template <>
+bool GenericUniformityAnalysisImpl<SSAContext>::isOperandUniform(
+ const Instruction &I, InstructionUniformity IU) const {
+ switch (IU) {
+ case InstructionUniformity::EitherOfFirstTwoOp:
+ return !isDivergentUse(I.getOperandUse(0)) ||
+ !isDivergentUse(I.getOperandUse(1));
+ default:
+ return false;
+ }
+}
+
// This ensures explicit instantiation of
// GenericUniformityAnalysisImpl::ImplDeleter::operator()
template class llvm::GenericUniformityInfo<SSAContext>;
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index e4b82ce83fda6..91a46b3b2f6be 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -148,6 +148,17 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
}
+template <>
+bool GenericUniformityAnalysisImpl<MachineSSAContext>::isOperandUniform(
+ const MachineInstr &I, InstructionUniformity IU) const {
+ switch (IU) {
+ case InstructionUniformity::EitherOfFirstTwoOp:
+ return !isDivergentUse(I.getOperand(0)) || !isDivergentUse(I.getOperand(1));
+ default:
+ return false;
+ }
+}
+
// This ensures explicit instantiation of
// GenericUniformityAnalysisImpl::ImplDeleter::operator()
template class llvm::GenericUniformityInfo<MachineSSAContext>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 2393346839707..f35f44f6cee29 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -317,8 +317,6 @@ def : SourceOfDivergence<int_amdgcn_live_mask>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
def : SourceOfDivergence<int_amdgcn_ds_ordered_add>;
def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>;
-def : SourceOfDivergence<int_amdgcn_permlane16>;
-def : SourceOfDivergence<int_amdgcn_permlanex16>;
def : SourceOfDivergence<int_amdgcn_permlane16_var>;
def : SourceOfDivergence<int_amdgcn_permlanex16_var>;
def : SourceOfDivergence<int_amdgcn_permlane_bcast>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 03d16fdd54c42..bb660656dbc60 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1574,3 +1574,17 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
}
return BaseT::getNumberOfParts(Tp);
}
+
+InstructionUniformity
+GCNTTIImpl::getInstructionUniformity(const Instruction &I) const {
+ if (const auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16:
+ return InstructionUniformity::EitherOfFirstTwoOp;
+ default:
+ break;
+ }
+ }
+ return InstructionUniformity::Default;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 20da8344c9d37..ef665a4b0c0a7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -302,6 +302,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
/// together under a single i32 value. Otherwise fall back to base
/// implementation.
unsigned getNumberOfParts(Type *Tp) const override;
+ InstructionUniformity
+ getInstructionUniformity(const Instruction &I) const override;
};
} // end namespace llvm
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index d5c6000a1eef6..d32f3d3284bc3 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -7,14 +7,14 @@ define amdgpu_kernel void @ds_swizzle(ptr addrspace(1) %out, i32 %src) #0 {
ret void
}
-; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+; CHECK: ALL VALUES UNIFORM
define amdgpu_kernel void @v_permlane16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
%v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
store i32 %v, ptr addrspace(1) %out
ret void
}
-; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+; CHECK: ALL VALUES UNIFORM
define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
%v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
store i32 %v, ptr addrspace(1) %out
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
new file mode 100644
index 0000000000000..37be465a7796b
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+
+; CHECK: ALL VALUES UNIFORM
+define amdgpu_kernel void @v_permlane16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+ %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: ALL VALUES UNIFORM
+define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @div_permlane16_var_uni_usr_x16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+ %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ store i32 %v1, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @div_permlane16_var_uni_x16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+ %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ store i32 %v1, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @div_permlane16_var_uni_usr_16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+ %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ store i32 %v1, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @div_permlane16_var_uni_16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+ %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ store i32 %v1, ptr addrspace(1) %out
+ ret void
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
>From 2452edc95d542fc5cb7ae773dc86f4ce940c9c60 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 4 Nov 2025 14:21:00 +0530
Subject: [PATCH 3/9] update enum name for more clarity
---
llvm/include/llvm/ADT/Uniformity.h | 5 +++--
llvm/lib/Analysis/UniformityAnalysis.cpp | 2 +-
llvm/lib/CodeGen/MachineUniformityAnalysis.cpp | 4 ++--
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 2 +-
4 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/llvm/include/llvm/ADT/Uniformity.h b/llvm/include/llvm/ADT/Uniformity.h
index 9571d43b8a9b9..ed558b004d322 100644
--- a/llvm/include/llvm/ADT/Uniformity.h
+++ b/llvm/include/llvm/ADT/Uniformity.h
@@ -25,8 +25,9 @@ enum class InstructionUniformity {
/// The result values can never be assumed to be uniform.
NeverUniform,
- /// Result value can be uniform if either of first two operand are uniform.
- EitherOfFirstTwoOp
+ /// Result value can be uniform if any of the first two use operands are
+ /// uniform.
+ AnyOfFirstTwoUseOp
};
} // namespace llvm
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 64aa9e42d673d..438247ed1ea46 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -109,7 +109,7 @@ template <>
bool GenericUniformityAnalysisImpl<SSAContext>::isOperandUniform(
const Instruction &I, InstructionUniformity IU) const {
switch (IU) {
- case InstructionUniformity::EitherOfFirstTwoOp:
+ case InstructionUniformity::AnyOfFirstTwoUseOp:
return !isDivergentUse(I.getOperandUse(0)) ||
!isDivergentUse(I.getOperandUse(1));
default:
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 91a46b3b2f6be..ee3aa8b9e2880 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -152,8 +152,8 @@ template <>
bool GenericUniformityAnalysisImpl<MachineSSAContext>::isOperandUniform(
const MachineInstr &I, InstructionUniformity IU) const {
switch (IU) {
- case InstructionUniformity::EitherOfFirstTwoOp:
- return !isDivergentUse(I.getOperand(0)) || !isDivergentUse(I.getOperand(1));
+ case InstructionUniformity::AnyOfFirstTwoUseOp:
+ return !isDivergentUse(I.getOperand(1)) || !isDivergentUse(I.getOperand(2));
default:
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index bb660656dbc60..563e5726f8593 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1581,7 +1581,7 @@ GCNTTIImpl::getInstructionUniformity(const Instruction &I) const {
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
- return InstructionUniformity::EitherOfFirstTwoOp;
+ return InstructionUniformity::AnyOfFirstTwoUseOp;
default:
break;
}
>From 62862fa670b46a650ec5fb25d1d4ff6d87c60b84 Mon Sep 17 00:00:00 2001
From: Pankaj Dwivedi <divedi.pk.117 at gmail.com>
Date: Wed, 5 Nov 2025 10:53:47 +0530
Subject: [PATCH 4/9] Apply suggestion from @arsenm
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
llvm/include/llvm/ADT/GenericUniformityImpl.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index fdc714b5fa778..12192d02adc65 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -428,7 +428,7 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
// Map containing tracked instruction that can be proven uniform based on its
// operand Uniformity.
- llvm::DenseMap<const InstructionT *, InstructionUniformity>
+ DenseMap<const InstructionT *, InstructionUniformity>
UniformInstruction;
/// \brief Mark \p Term as divergent and push all Instructions that become
>From 4f71ec70dd863841acf0aac8748f0e2311d9ff16 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Mon, 17 Nov 2025 15:17:36 +0530
Subject: [PATCH 5/9] let getInstructionUniformity hook wrap
isSourceOfDivergence/isAlwaysUniform
---
.../llvm/Analysis/TargetTransformInfo.h | 3 ++-
.../llvm/Analysis/TargetTransformInfoImpl.h | 7 +++++--
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 ++--
llvm/lib/Analysis/UniformityAnalysis.cpp | 18 +++++++++++-----
.../lib/CodeGen/MachineUniformityAnalysis.cpp | 10 +++------
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 21 ++++++++++++++++---
.../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 4 ++--
.../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 13 ++++++++++++
.../Target/NVPTX/NVPTXTargetTransformInfo.h | 1 +
9 files changed, 59 insertions(+), 22 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index ad5ab5ddce40f..fe5184e0eae51 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -2000,7 +2000,8 @@ class TargetTransformInfo {
/// Returns true if GEP should not be used to index into vectors for this
/// target.
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const;
- InstructionUniformity getInstructionUniformity(const Instruction &I) const;
+
+ InstructionUniformity getInstructionUniformity(const Value *V) const;
private:
std::unique_ptr<const TargetTransformInfoImplBase> TTIImpl;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 847dd46c01cfc..a9ec71e10ecbb 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1153,8 +1153,11 @@ class TargetTransformInfoImplBase {
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {}
virtual bool allowVectorElementIndexingUsingGEP() const { return true; }
- virtual InstructionUniformity
- getInstructionUniformity(const Instruction &I) const {
+
+ // New API for uniformity classification.
+ // Targets should override this to provide target-specific uniformity
+ // analysis; the default implementation conservatively returns Default.
+ virtual InstructionUniformity getInstructionUniformity(const Value *V) const {
return InstructionUniformity::Default;
}
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index cebe4e207c66d..8a3674953f950 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1518,8 +1518,8 @@ bool TargetTransformInfo::allowVectorElementIndexingUsingGEP() const {
}
InstructionUniformity
-TargetTransformInfo::getInstructionUniformity(const Instruction &I) const {
- return TTIImpl->getInstructionUniformity(I);
+TargetTransformInfo::getInstructionUniformity(const Value *V) const {
+ return TTIImpl->getInstructionUniformity(V);
}
TargetTransformInfoImplBase::~TargetTransformInfoImplBase() = default;
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 438247ed1ea46..4a317ea241fff 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -32,16 +32,24 @@ bool llvm::GenericUniformityAnalysisImpl<SSAContext>::markDefsDivergent(
template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
for (auto &I : instructions(F)) {
- if (TTI->isSourceOfDivergence(&I))
+ InstructionUniformity IU = TTI->getInstructionUniformity(&I);
+ switch (IU) {
+ case InstructionUniformity::NeverUniform:
markDivergent(I);
- else if (TTI->isAlwaysUniform(&I))
+ break;
+ case InstructionUniformity::AlwaysUniform:
addUniformOverride(I);
- InstructionUniformity IU = TTI->getInstructionUniformity(I);
- if (IU != InstructionUniformity::Default)
+ break;
+ case InstructionUniformity::Default:
+ break;
+ default:
addUniformInstruction(&I, IU);
+ break;
+ }
}
for (auto &Arg : F.args()) {
- if (TTI->isSourceOfDivergence(&Arg)) {
+ if (TTI->getInstructionUniformity(&Arg) ==
+ InstructionUniformity::NeverUniform) {
markDivergent(&Arg);
}
}
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index ee3aa8b9e2880..8d08ab8ae25b4 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -148,15 +148,11 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
}
+// This can be defined later depending on use of the MachineUniformityAnalysis.
template <>
bool GenericUniformityAnalysisImpl<MachineSSAContext>::isOperandUniform(
- const MachineInstr &I, InstructionUniformity IU) const {
- switch (IU) {
- case InstructionUniformity::AnyOfFirstTwoUseOp:
- return !isDivergentUse(I.getOperand(1)) || !isDivergentUse(I.getOperand(2));
- default:
- return false;
- }
+ const MachineInstr &MI, InstructionUniformity IU) const {
+ return false;
}
// This ensures explicit instantiation of
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 563e5726f8593..bfa68c975ec0c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1575,16 +1575,31 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
return BaseT::getNumberOfParts(Tp);
}
+// New API that wraps the old isSourceOfDivergence and isAlwaysUniform APIs
+// with additional support for new uniformity classifications
InstructionUniformity
-GCNTTIImpl::getInstructionUniformity(const Instruction &I) const {
- if (const auto *II = dyn_cast<IntrinsicInst>(&I)) {
- switch (II->getIntrinsicID()) {
+GCNTTIImpl::getInstructionUniformity(const Value *V) const {
+ // Check for new special cases first (permlane16/permlanex16)
+ // These need operand-dependent uniformity analysis
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
+ switch (Intrinsic->getIntrinsicID()) {
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
+ // Result value can be uniform if either of the first two operands is uniform
return InstructionUniformity::AnyOfFirstTwoUseOp;
default:
break;
}
}
+
+ // Delegate to old APIs for backward compatibility
+ if (isAlwaysUniform(V))
+ return InstructionUniformity::AlwaysUniform;
+
+ // Check if source of divergence
+ if (isSourceOfDivergence(V))
+ return InstructionUniformity::NeverUniform;
+
+ // Default behavior
return InstructionUniformity::Default;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index ef665a4b0c0a7..c2e102c9bab8e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -302,8 +302,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
/// together under a single i32 value. Otherwise fall back to base
/// implementation.
unsigned getNumberOfParts(Type *Tp) const override;
- InstructionUniformity
- getInstructionUniformity(const Instruction &I) const override;
+
+ InstructionUniformity getInstructionUniformity(const Value *V) const override;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 64593e6439184..177fa24e5433c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -635,3 +635,16 @@ void NVPTXTTIImpl::collectKernelLaunchBounds(
if (MaxNTID.size() > 2)
LB.push_back({"maxntidz", MaxNTID[2]});
}
+
+// New API that wraps the old isSourceOfDivergence API.
+// NVPTX does not implement isAlwaysUniform, so we only delegate to
+// isSourceOfDivergence.
+InstructionUniformity
+NVPTXTTIImpl::getInstructionUniformity(const Value *V) const {
+ // Delegate to old API for backward compatibility
+ if (isSourceOfDivergence(V))
+ return InstructionUniformity::NeverUniform;
+
+ // Default behavior
+ return InstructionUniformity::Default;
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 78eb751cf3c2e..640d7b461b649 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -195,6 +195,7 @@ class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {
// Self-referential globals are not supported.
return false;
}
+ InstructionUniformity getInstructionUniformity(const Value *V) const override;
};
} // end namespace llvm
>From 5647603bfcda8366d0886a3a124120845cf02bb8 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 19 Nov 2025 17:38:58 +0530
Subject: [PATCH 6/9] update the operand check & update machine inst uniformity
---
llvm/lib/Analysis/UniformityAnalysis.cpp | 7 +-
.../lib/CodeGen/MachineUniformityAnalysis.cpp | 52 +++++++++--
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 7 ++
.../AMDGPU/MIR/uniform-permlane.mir | 86 +++++++++++++++++++
.../AMDGPU/uniform_intrinsic.ll | 8 +-
5 files changed, 146 insertions(+), 14 deletions(-)
create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 4a317ea241fff..2b17010d5326d 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -118,8 +118,11 @@ bool GenericUniformityAnalysisImpl<SSAContext>::isOperandUniform(
const Instruction &I, InstructionUniformity IU) const {
switch (IU) {
case InstructionUniformity::AnyOfFirstTwoUseOp:
- return !isDivergentUse(I.getOperandUse(0)) ||
- !isDivergentUse(I.getOperandUse(1));
+    // Operand layout for permlane16/permlanex16 is:
+    //   <old> <src0> <src1> <src2> <fi> <bound_control>.
+    // The result is uniform if either src0 (operand 1) or the lane select
+ return !isDivergentUse(I.getOperandUse(1)) ||
+ !isDivergentUse(I.getOperandUse(2));
default:
return false;
}
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 8d08ab8ae25b4..2941e679653ba 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -53,13 +53,18 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
for (const MachineBasicBlock &block : F) {
for (const MachineInstr &instr : block) {
auto uniformity = InstrInfo.getInstructionUniformity(instr);
- if (uniformity == InstructionUniformity::AlwaysUniform) {
- addUniformOverride(instr);
- continue;
- }
-
- if (uniformity == InstructionUniformity::NeverUniform) {
+ switch (uniformity) {
+ case InstructionUniformity::NeverUniform:
markDivergent(instr);
+ break;
+ case InstructionUniformity::AlwaysUniform:
+ addUniformOverride(instr);
+ break;
+ case InstructionUniformity::Default:
+ break;
+ default:
+ addUniformInstruction(&instr, uniformity);
+ break;
}
}
}
@@ -148,11 +153,42 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
}
-// This can be defined later depending on use of the MachineUniformityAnalysis.
template <>
bool GenericUniformityAnalysisImpl<MachineSSAContext>::isOperandUniform(
const MachineInstr &MI, InstructionUniformity IU) const {
- return false;
+ switch (IU) {
+ // For permlane16/permlanex16, check if either src or lane select is uniform
+ // These instructions have mixed immediate and register operands:
+ // Operand 1 is src0 (the source value to permute)
+ // Operand 3 is src1 (lane select - which lane within the 16 to read from)
+ // Result is uniform if EITHER the source OR lane select is uniform
+ case InstructionUniformity::AnyOfFirstTwoUseOp: {
+ // Check if any of the first two register use operands is uniform
+ // Result is uniform if ANY of these operands is uniform
+ const MachineOperand *FirstRegOp = nullptr;
+ const MachineOperand *SecondRegOp = nullptr;
+
+ // Find the first two register use operands
+ for (const MachineOperand &MO : MI.uses()) {
+ if (MO.isReg() && MO.getReg().isVirtual()) {
+ if (!FirstRegOp)
+ FirstRegOp = &MO;
+ else if (!SecondRegOp) {
+ SecondRegOp = &MO;
+ break;
+ }
+ }
+ }
+
+ if (!FirstRegOp || !SecondRegOp)
+ return false;
+
+ // Return true if either operand is uniform
+ return !isDivergentUse(*FirstRegOp) || !isDivergentUse(*SecondRegOp);
+ }
+ default:
+ return false;
+ }
}
// This ensures explicit instantiation of
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 7cb7f47ddb220..2db4d1bbd96ab 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10574,6 +10574,13 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::NeverUniform;
unsigned opcode = MI.getOpcode();
+
+ // Special handling for permlane16/permlanex16 - uniformity depends on
+ // operands
+ if (opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
+ opcode == AMDGPU::V_PERMLANEX16_B32_e64)
+ return InstructionUniformity::AnyOfFirstTwoUseOp;
+
if (opcode == AMDGPU::V_READLANE_B32 ||
opcode == AMDGPU::V_READFIRSTLANE_B32 ||
opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
new file mode 100644
index 0000000000000..f08d16affef23
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
@@ -0,0 +1,86 @@
+# RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+
+# Test the machine-level uniformity analysis for permlane16/permlanex16 instructions.
+#
+# NOTE: Permlane instructions have a hardware constraint that src1 (lane select) and src2
+# must be SGPR (scalar) registers. Since SGPRs are always uniform at machine level,
+# permlane results are always uniform according to the AnyOfFirstTwoUseOp logic
+# (either src0 OR src1 being uniform makes the result uniform, and src1 is always uniform).
+#
+# These tests verify that the uniformity analysis correctly handles permlane instructions
+# and that uniform results propagate through chains of operations.
+
+---
+# Test: permlane16 with divergent VGPR src and uniform SGPR lane select
+# Result is UNIFORM because lane select (SGPR) is always uniform
+name: permlane16_basic
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: MachineUniformityInfo for function: permlane16_basic
+ ; CHECK: ALL VALUES UNIFORM
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = S_MOV_B32 5
+ %2:sreg_32 = IMPLICIT_DEF
+ %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+# Test: permlanex16 with divergent VGPR src and uniform SGPR lane select
+# Result is UNIFORM because lane select (SGPR) is always uniform
+name: permlanex16_basic
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: MachineUniformityInfo for function: permlanex16_basic
+ ; CHECK: ALL VALUES UNIFORM
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = S_MOV_B32 7
+ %2:sreg_32 = IMPLICIT_DEF
+ %3:vgpr_32 = V_PERMLANEX16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+# Test: Chain of permlane operations - uniformity propagates
+# Both permlanes are uniform, second uses result of first as source
+name: permlane16_chain_uniform
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: MachineUniformityInfo for function: permlane16_chain_uniform
+ ; CHECK: ALL VALUES UNIFORM
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = S_MOV_B32 3
+ %2:sreg_32 = IMPLICIT_DEF
+ ; First permlane - uniform because lane select is SGPR
+ %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
+ ; Second permlane uses uniform result - also uniform
+ %4:vgpr_32 = V_PERMLANEX16_B32_e64 0, %3, 0, %1, 0, %2, %3, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+# Test: Multiple permlane operations in sequence
+# Verifies that uniformity is correctly tracked through complex chains
+name: permlane_multiple
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: MachineUniformityInfo for function: permlane_multiple
+ ; CHECK: ALL VALUES UNIFORM
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = S_MOV_B32 1
+ %2:sreg_32 = S_MOV_B32 2
+ %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
+ %4:vgpr_32 = V_PERMLANEX16_B32_e64 0, %3, 0, %1, 0, %2, %3, 0, implicit $exec
+ %5:vgpr_32 = V_PERMLANE16_B32_e64 0, %4, 0, %2, 0, %1, %4, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
index 37be465a7796b..e7391ee0c265b 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
@@ -16,11 +16,11 @@ define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i
}
; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK: %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false)
; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
define amdgpu_kernel void @div_permlane16_var_uni_usr_x16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
%v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
- %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false) #0
store i32 %v1, ptr addrspace(1) %out
ret void
}
@@ -36,11 +36,11 @@ define amdgpu_kernel void @div_permlane16_var_uni_x16(ptr addrspace(1) %out, i32
}
; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK: %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK: %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false)
; CHECK: store i32 %v1, ptr addrspace(1) %out, align 4
define amdgpu_kernel void @div_permlane16_var_uni_usr_16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
%v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
- %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+ %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false) #0
store i32 %v1, ptr addrspace(1) %out
ret void
}
>From 6ff4c4b19c1611d37306a7244f77abd133747fc4 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 19 Nov 2025 18:59:57 +0530
Subject: [PATCH 7/9] Fix formatting
---
llvm/include/llvm/ADT/GenericUniformityImpl.h | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 12192d02adc65..307d51ac25288 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -1,4 +1,4 @@
-//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===//
+//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -428,8 +428,7 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
// Map containing tracked instruction that can be proven uniform based on its
// operand Uniformity.
- DenseMap<const InstructionT *, InstructionUniformity>
- UniformInstruction;
+ DenseMap<const InstructionT *, InstructionUniformity> UniformInstruction;
/// \brief Mark \p Term as divergent and push all Instructions that become
/// divergent as a result on the worklist.
>From 33e36d4c429c98f6f1683abba3a9f3dd52ca84be Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 19 Nov 2025 19:50:03 +0530
Subject: [PATCH 8/9] update mir test check
---
.../UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
index f08d16affef23..da6048d86b2dd 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
@@ -18,7 +18,7 @@ machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
- ; CHECK-LABEL: MachineUniformityInfo for function: permlane16_basic
+ ; CHECK-LABEL: MachineUniformityInfo for function: @permlane16_basic
; CHECK: ALL VALUES UNIFORM
%0:vgpr_32 = IMPLICIT_DEF
%1:sreg_32 = S_MOV_B32 5
@@ -35,7 +35,7 @@ machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
- ; CHECK-LABEL: MachineUniformityInfo for function: permlanex16_basic
+ ; CHECK-LABEL: MachineUniformityInfo for function: @permlanex16_basic
; CHECK: ALL VALUES UNIFORM
%0:vgpr_32 = IMPLICIT_DEF
%1:sreg_32 = S_MOV_B32 7
@@ -52,7 +52,7 @@ machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
- ; CHECK-LABEL: MachineUniformityInfo for function: permlane16_chain_uniform
+ ; CHECK-LABEL: MachineUniformityInfo for function: @permlane16_chain_uniform
; CHECK: ALL VALUES UNIFORM
%0:vgpr_32 = IMPLICIT_DEF
%1:sreg_32 = S_MOV_B32 3
@@ -72,7 +72,7 @@ machineFunctionInfo:
isEntryFunction: true
body: |
bb.0:
- ; CHECK-LABEL: MachineUniformityInfo for function: permlane_multiple
+ ; CHECK-LABEL: MachineUniformityInfo for function: @permlane_multiple
; CHECK: ALL VALUES UNIFORM
%0:vgpr_32 = IMPLICIT_DEF
%1:sreg_32 = S_MOV_B32 1
>From 9e7f001d7757b2ce63ebed093338b365fc162b7e Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Thu, 20 Nov 2025 16:30:26 +0530
Subject: [PATCH 9/9] separate complex target-based custom logic through target
hook
---
llvm/include/llvm/ADT/GenericUniformityImpl.h | 15 +++--
llvm/include/llvm/ADT/Uniformity.h | 7 ++-
.../llvm/Analysis/TargetTransformInfo.h | 12 ++++
.../llvm/Analysis/TargetTransformInfoImpl.h | 8 +++
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 16 ++++++
llvm/lib/Analysis/TargetTransformInfo.cpp | 5 ++
llvm/lib/Analysis/UniformityAnalysis.cpp | 28 +++++-----
.../lib/CodeGen/MachineUniformityAnalysis.cpp | 56 +++++++------------
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 32 +++++++++--
.../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 3 +
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 23 +++++++-
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 3 +
12 files changed, 144 insertions(+), 64 deletions(-)
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 307d51ac25288..a334e46b542e1 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -408,9 +408,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
void recordTemporalDivergence(ConstValueRefT, const InstructionT *,
const CycleT *);
- bool isOperandUniform(const InstructionT &I, InstructionUniformity IU) const;
+ /// Check if an instruction with Custom uniformity can be proven uniform
+ /// based on its operands. This queries the target-specific callback.
+ bool isCustomUniform(const InstructionT &I) const;
- /// \brief keep track of target instruction that can be proven uniform.
+  /// \brief Keep track of instructions that require custom uniformity analysis.
void addUniformInstruction(const InstructionT *I, InstructionUniformity IU);
protected:
@@ -795,10 +797,13 @@ void GenericUniformityAnalysisImpl<ContextT>::markDivergent(
const InstructionT &I) {
if (isAlwaysUniform(I))
return;
+ // Check if instruction requires custom uniformity analysis
auto It = UniformInstruction.find(&I);
- if (It != UniformInstruction.end() && isOperandUniform(I, It->second)) {
- addUniformOverride(I);
- return;
+ if (It != UniformInstruction.end()) {
+ if (It->second == InstructionUniformity::Custom && isCustomUniform(I)) {
+ addUniformOverride(I);
+ return;
+ }
}
bool Marked = false;
if (I.isTerminator()) {
diff --git a/llvm/include/llvm/ADT/Uniformity.h b/llvm/include/llvm/ADT/Uniformity.h
index ed558b004d322..43e588745f73f 100644
--- a/llvm/include/llvm/ADT/Uniformity.h
+++ b/llvm/include/llvm/ADT/Uniformity.h
@@ -25,9 +25,10 @@ enum class InstructionUniformity {
/// The result values can never be assumed to be uniform.
NeverUniform,
- /// Result value can be uniform if any of the first two use operand are
- /// uniform.
- AnyOfFirstTwoUseOp
+ /// If all operands are uniform, the result values are uniform. Otherwise,
+ /// the result values may be divergent, and a custom check may be used to
+ /// determine uniformity via a callback.
+ Custom
};
} // namespace llvm
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index fe5184e0eae51..b3681592788d4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -2003,6 +2003,18 @@ class TargetTransformInfo {
InstructionUniformity getInstructionUniformity(const Value *V) const;
+ /// Determine if an instruction with some operands uniform can be proven
+ /// uniform. This is used for custom uniformity analysis where the target
+ /// can define complex rules that depend on which specific operands are
+ /// uniform.
+ ///
+ /// \param I The instruction to check.
+ /// \param UniformArgs A bitvector indicating which operands are known to be
+ /// uniform (bit N corresponds to operand N).
+ /// \returns true if the instruction result can be proven uniform given the
+ /// uniform operands, false otherwise.
+ bool isUniform(const Instruction *I, const SmallBitVector &UniformArgs) const;
+
private:
std::unique_ptr<const TargetTransformInfoImplBase> TTIImpl;
};
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index a9ec71e10ecbb..93eb0735c83ee 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1161,6 +1161,14 @@ class TargetTransformInfoImplBase {
return InstructionUniformity::Default;
}
+ // Custom uniformity check for instructions marked as Custom
+ // Override this to provide complex uniformity rules based on which operands
+ // are uniform
+ virtual bool isUniform(const Instruction *I,
+ const SmallBitVector &UniformArgs) const {
+ return false; // Conservative: assume divergent
+ }
+
protected:
// Obtain the minimum required size to hold the value (without the sign)
// In case of a vector it returns the min required size for one element.
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 18142c2c0adf3..ad917f3d295ab 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2359,6 +2359,22 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
return InstructionUniformity::Default;
}
+ /// Determine if a machine instruction with some operands uniform can be
+ /// proven uniform. This is used for custom uniformity analysis where the
+ /// target can define complex rules that depend on which specific operands
+ /// are uniform.
+ ///
+ /// \param MI The machine instruction to check.
+ /// \param UniformArgs A bitvector indicating which register operands are
+ /// known to be uniform (bit N corresponds to the Nth
+ /// register use operand).
+ /// \returns true if the instruction result can be proven uniform given the
+ /// uniform operands, false otherwise.
+ virtual bool isUniform(const MachineInstr &MI,
+ const SmallBitVector &UniformArgs) const {
+ return false; // Conservative: assume divergent
+ }
+
/// Returns true if the given \p MI defines a TargetIndex operand that can be
/// tracked by their offset, can have values, and can have debug info
/// associated with it. If so, sets \p Index and \p Offset of the target index
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 8a3674953f950..b84b1cf2afd1c 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1522,6 +1522,11 @@ TargetTransformInfo::getInstructionUniformity(const Value *V) const {
return TTIImpl->getInstructionUniformity(V);
}
+bool TargetTransformInfo::isUniform(const Instruction *I,
+ const SmallBitVector &UniformArgs) const {
+ return TTIImpl->isUniform(I, UniformArgs);
+}
+
TargetTransformInfoImplBase::~TargetTransformInfoImplBase() = default;
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 2b17010d5326d..240455d8631ba 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -8,6 +8,7 @@
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/ADT/GenericUniformityImpl.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/Uniformity.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -40,11 +41,12 @@ template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
case InstructionUniformity::AlwaysUniform:
addUniformOverride(I);
break;
- case InstructionUniformity::Default:
- break;
- default:
+ case InstructionUniformity::Custom:
+ // Instructions requiring custom uniformity analysis based on operands
addUniformInstruction(&I, IU);
break;
+ case InstructionUniformity::Default:
+ break;
}
}
for (auto &Arg : F.args()) {
@@ -114,18 +116,16 @@ bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
}
template <>
-bool GenericUniformityAnalysisImpl<SSAContext>::isOperandUniform(
- const Instruction &I, InstructionUniformity IU) const {
- switch (IU) {
- case InstructionUniformity::AnyOfFirstTwoUseOp:
- // For permlane16/permlanex16: <old> <src0> <src1> <src2> <fi>
- // <bound_control> Check if either src0 (operand 1) or src1 (operand 2 -
- // lane select) is uniform
- return !isDivergentUse(I.getOperandUse(1)) ||
- !isDivergentUse(I.getOperandUse(2));
- default:
- return false;
+bool GenericUniformityAnalysisImpl<SSAContext>::isCustomUniform(
+ const Instruction &I) const {
+ // Build bitvector of uniform operands
+ SmallBitVector UniformArgs(I.getNumOperands());
+ for (unsigned OpIdx = 0; OpIdx < I.getNumOperands(); ++OpIdx) {
+ UniformArgs[OpIdx] = !isDivergentUse(I.getOperandUse(OpIdx));
}
+
+ // Query target-specific uniformity callback
+ return TTI->isUniform(&I, UniformArgs);
}
// This ensures explicit instantiation of
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 2941e679653ba..cf25b2e4d19c7 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -8,6 +8,7 @@
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/ADT/GenericUniformityImpl.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/MachineCycleAnalysis.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -60,11 +61,12 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
case InstructionUniformity::AlwaysUniform:
addUniformOverride(instr);
break;
- case InstructionUniformity::Default:
- break;
- default:
+ case InstructionUniformity::Custom:
+ // Instructions requiring custom uniformity analysis based on operands
addUniformInstruction(&instr, uniformity);
break;
+ case InstructionUniformity::Default:
+ break;
}
}
}
@@ -154,41 +156,25 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
}
template <>
-bool GenericUniformityAnalysisImpl<MachineSSAContext>::isOperandUniform(
- const MachineInstr &MI, InstructionUniformity IU) const {
- switch (IU) {
- // For permlane16/permlanex16, check if either src or lane select is uniform
- // These instructions have mixed immediate and register operands:
- // Operand 1 is src0 (the source value to permute)
- // Operand 3 is src1 (lane select - which lane within the 16 to read from)
- // Result is uniform if EITHER the source OR lane select is uniform
- case InstructionUniformity::AnyOfFirstTwoUseOp: {
- // Check if any of the first two register use operands is uniform
- // Result is uniform if ANY of these operands is uniform
- const MachineOperand *FirstRegOp = nullptr;
- const MachineOperand *SecondRegOp = nullptr;
-
- // Find the first two register use operands
- for (const MachineOperand &MO : MI.uses()) {
- if (MO.isReg() && MO.getReg().isVirtual()) {
- if (!FirstRegOp)
- FirstRegOp = &MO;
- else if (!SecondRegOp) {
- SecondRegOp = &MO;
- break;
- }
- }
- }
-
- if (!FirstRegOp || !SecondRegOp)
- return false;
+bool GenericUniformityAnalysisImpl<MachineSSAContext>::isCustomUniform(
+ const MachineInstr &MI) const {
+ const auto &InstrInfo = *F.getSubtarget().getInstrInfo();
- // Return true if either operand is uniform
- return !isDivergentUse(*FirstRegOp) || !isDivergentUse(*SecondRegOp);
+ // Build bitvector of uniform register use operands
+ SmallVector<const MachineOperand *, 4> RegUseOps;
+ for (const MachineOperand &MO : MI.uses()) {
+ if (MO.isReg() && MO.getReg().isVirtual()) {
+ RegUseOps.push_back(&MO);
+ }
}
- default:
- return false;
+
+ SmallBitVector UniformArgs(RegUseOps.size());
+ for (unsigned i = 0; i < RegUseOps.size(); ++i) {
+ UniformArgs[i] = !isDivergentUse(*RegUseOps[i]);
}
+
+ // Query target-specific uniformity callback
+ return InstrInfo.isUniform(MI, UniformArgs);
}
// This ensures explicit instantiation of
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index bfa68c975ec0c..fb67946d39737 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -18,6 +18,7 @@
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIModeRegisterDefaults.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -1579,14 +1580,12 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
// with additional support for new uniformity classifications
InstructionUniformity
GCNTTIImpl::getInstructionUniformity(const Value *V) const {
- // Check for new special cases first (permlane16/permlanex16)
- // These need operand-dependent uniformity analysis
+ // Check for special cases requiring custom uniformity analysis
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
switch (Intrinsic->getIntrinsicID()) {
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
- // Result value can be uniform if either of first two operands are uniform
- return InstructionUniformity::AnyOfFirstTwoUseOp;
+ return InstructionUniformity::Custom;
default:
break;
}
@@ -1603,3 +1602,28 @@ GCNTTIImpl::getInstructionUniformity(const Value *V) const {
// Default behavior
return InstructionUniformity::Default;
}
+
+bool GCNTTIImpl::isUniform(const Instruction *I,
+ const SmallBitVector &UniformArgs) const {
+ // Custom uniformity check for permlane16/permlanex16
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(I)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16:
+ // For permlane16/permlanex16:
+ // Operand 0: old value (ignored for uniformity)
+ // Operand 1: src0 (source value to permute)
+ // Operand 2: src1 (lane select within 16-lane group)
+ // Operand 3: src2 (which 16-lane group)
+ // Result is uniform if either src0 (op 1) or src1 (op 2) is uniform
+ if (UniformArgs.size() > 2) {
+ return UniformArgs[1] || UniformArgs[2];
+ }
+ return false;
+ default:
+ break;
+ }
+ }
+
+ return false;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index c2e102c9bab8e..86a5715907d5f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -304,6 +304,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
unsigned getNumberOfParts(Type *Tp) const override;
InstructionUniformity getInstructionUniformity(const Value *V) const override;
+
+ bool isUniform(const Instruction *I,
+ const SmallBitVector &UniformArgs) const override;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2db4d1bbd96ab..cb6ccc9cb1332 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -20,6 +20,7 @@
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -10575,11 +10576,10 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
unsigned opcode = MI.getOpcode();
- // Special handling for permlane16/permlanex16 - uniformity depends on
- // operands
+ // permlane16/permlanex16 require custom uniformity analysis
if (opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
opcode == AMDGPU::V_PERMLANEX16_B32_e64)
- return InstructionUniformity::AnyOfFirstTwoUseOp;
+ return InstructionUniformity::Custom;
if (opcode == AMDGPU::V_READLANE_B32 ||
opcode == AMDGPU::V_READFIRSTLANE_B32 ||
@@ -10658,6 +10658,23 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::Default;
}
+bool SIInstrInfo::isUniform(const MachineInstr &MI,
+ const SmallBitVector &UniformArgs) const {
+ unsigned opcode = MI.getOpcode();
+
+ // Custom uniformity check for permlane16/permlanex16
+ if (opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
+ opcode == AMDGPU::V_PERMLANEX16_B32_e64) {
+ // Result is uniform if either src0 or src1 is uniform
+ // UniformArgs[0] = src0 (source value)
+ // UniformArgs[1] = src1 (lane select)
+ if (UniformArgs.size() >= 2) {
+ return UniformArgs[0] || UniformArgs[1];
+ }
+ }
+
+ return false;
+}
unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
switch (MF.getFunction().getCallingConv()) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index c66985a19685b..ac22fa58dc04f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1640,6 +1640,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
InstructionUniformity
getInstructionUniformity(const MachineInstr &MI) const final;
+ bool isUniform(const MachineInstr &MI,
+ const SmallBitVector &UniformArgs) const final;
+
InstructionUniformity
getGenericInstructionUniformity(const MachineInstr &MI) const;
More information about the llvm-commits
mailing list