[llvm] [AMDGPU][Uniformity][TTI] Make Uniformity Analysis Operand-Aware via Custom Divergence Checks (PR #137639)

Pankaj Dwivedi via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 24 04:03:08 PDT 2026


https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/137639

>From a30a915e7f241a840f96668d0a1be250de6c4b1c Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 23 Sep 2025 17:25:29 +0530
Subject: [PATCH 01/17] [NFC] Move isDivergentUse so that the dependent function
 pushUsers, defined later, can safely use it

---
 llvm/lib/Analysis/UniformityAnalysis.cpp | 26 ++++++++++++------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index b56534935d7c2..f6cf67463c8a1 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -29,6 +29,19 @@ bool llvm::GenericUniformityAnalysisImpl<SSAContext>::markDefsDivergent(
   return markDivergent(cast<Value>(&Instr));
 }
 
+template <>
+bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
+    const Use &U) const {
+  const auto *V = U.get();
+  if (isDivergent(V))
+    return true;
+  if (const auto *DefInstr = dyn_cast<Instruction>(V)) {
+    const auto *UseInstr = cast<Instruction>(U.getUser());
+    return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
+  }
+  return false;
+}
+
 template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
   for (auto &I : instructions(F)) {
     InstructionUniformity IU = TTI->getInstructionUniformity(&I);
@@ -95,19 +108,6 @@ void llvm::GenericUniformityAnalysisImpl<
   }
 }
 
-template <>
-bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
-    const Use &U) const {
-  const auto *V = U.get();
-  if (isDivergent(V))
-    return true;
-  if (const auto *DefInstr = dyn_cast<Instruction>(V)) {
-    const auto *UseInstr = cast<Instruction>(U.getUser());
-    return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
-  }
-  return false;
-}
-
 // This ensures explicit instantiation of
 // GenericUniformityAnalysisImpl::ImplDeleter::operator()
 template class llvm::GenericUniformityInfo<SSAContext>;

>From c41a9ec639d8bf07578b16d037c13768e6bbd224 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Sat, 1 Nov 2025 02:02:14 +0530
Subject: [PATCH 02/17] Add a target hook to capture special operand uniformity
 and update UA to use it

---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 22 +++++++
 llvm/include/llvm/ADT/Uniformity.h            |  5 +-
 .../llvm/Analysis/TargetTransformInfo.h       |  1 +
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  4 ++
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  5 ++
 llvm/lib/Analysis/UniformityAnalysis.cpp      | 39 ++++++++----
 .../lib/CodeGen/MachineUniformityAnalysis.cpp | 11 ++++
 .../UniformityAnalysis/AMDGPU/intrinsics.ll   |  4 +-
 .../AMDGPU/uniform_intrinsic.ll               | 59 +++++++++++++++++++
 9 files changed, 134 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 2db76a1ad9b13..23a774bed21a9 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -51,6 +51,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SparseBitVector.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Uniformity.h"
 #include "llvm/Support/raw_ostream.h"
 
 #define DEBUG_TYPE "uniformity"
@@ -407,6 +408,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   void recordTemporalDivergence(ConstValueRefT, const InstructionT *,
                                 const CycleT *);
 
+  bool isOperandUniform(const InstructionT &I, InstructionUniformity IU) const;
+
+  /// \brief keep track of target instruction that can be proven uniform.
+  void addUniformInstruction(const InstructionT *I, InstructionUniformity IU);
+
 protected:
   const ContextT &Context;
   const FunctionT &F;
@@ -420,6 +426,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   // Internal worklist for divergence propagation.
   std::vector<const InstructionT *> Worklist;
 
+  // Map containing tracked instruction that can be proven uniform based on its
+  // operand Uniformity.
+  llvm::DenseMap<const InstructionT *, InstructionUniformity>
+      UniformInstruction;
+
   /// \brief Mark \p Term as divergent and push all Instructions that become
   /// divergent as a result on the worklist.
   void analyzeControlDivergence(const InstructionT &Term);
@@ -783,6 +794,11 @@ void GenericUniformityAnalysisImpl<ContextT>::markDivergent(
     const InstructionT &I) {
   if (isAlwaysUniform(I))
     return;
+  auto It = UniformInstruction.find(&I);
+  if (It != UniformInstruction.end() && isOperandUniform(I, It->second)) {
+    addUniformOverride(I);
+    return;
+  }
   bool Marked = false;
   if (I.isTerminator()) {
     Marked = DivergentTermBlocks.insert(I.getParent()).second;
@@ -814,6 +830,12 @@ void GenericUniformityAnalysisImpl<ContextT>::addUniformOverride(
   UniformOverrides.insert(&Instr);
 }
 
+template <typename ContextT>
+void GenericUniformityAnalysisImpl<ContextT>::addUniformInstruction(
+    const InstructionT *I, InstructionUniformity IU) {
+  UniformInstruction[I] = IU;
+}
+
 // Mark as divergent all external uses of values defined in \p DefCycle.
 //
 // A value V defined by a block B inside \p DefCycle may be used outside the
diff --git a/llvm/include/llvm/ADT/Uniformity.h b/llvm/include/llvm/ADT/Uniformity.h
index 21ca106b80be3..9571d43b8a9b9 100644
--- a/llvm/include/llvm/ADT/Uniformity.h
+++ b/llvm/include/llvm/ADT/Uniformity.h
@@ -23,7 +23,10 @@ enum class InstructionUniformity {
   AlwaysUniform,
 
   /// The result values can never be assumed to be uniform.
-  NeverUniform
+  NeverUniform,
+
+  /// Result value can be uniform if either of first two operand are uniform.
+  EitherOfFirstTwoOp
 };
 
 } // namespace llvm
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 9cb4a97f9459c..707c57e1160ba 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -2083,6 +2083,7 @@ class TargetTransformInfo {
   /// Returns true if GEP should not be used to index into vectors for this
   /// target.
   LLVM_ABI bool allowVectorElementIndexingUsingGEP() const;
+  InstructionUniformity getInstructionUniformity(const Instruction &I) const;
 
 private:
   std::unique_ptr<const TargetTransformInfoImplBase> TTIImpl;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 410b9187a5fed..94d6d9ecba35c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1227,6 +1227,10 @@ class TargetTransformInfoImplBase {
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {}
 
   virtual bool allowVectorElementIndexingUsingGEP() const { return true; }
+  virtual InstructionUniformity
+  getInstructionUniformity(const Instruction &I) const {
+    return InstructionUniformity::Default;
+  }
 
 protected:
   // Obtain the minimum required size to hold the value (without the sign)
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index b0afba2c78884..bf26ff4079da5 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1551,6 +1551,11 @@ bool TargetTransformInfo::allowVectorElementIndexingUsingGEP() const {
   return TTIImpl->allowVectorElementIndexingUsingGEP();
 }
 
+InstructionUniformity
+TargetTransformInfo::getInstructionUniformity(const Instruction &I) const {
+  return TTIImpl->getInstructionUniformity(I);
+}
+
 TargetTransformInfoImplBase::~TargetTransformInfoImplBase() = default;
 
 TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index f6cf67463c8a1..9b7cb33d7f76a 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -8,6 +8,7 @@
 
 #include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/ADT/GenericUniformityImpl.h"
+#include "llvm/ADT/Uniformity.h"
 #include "llvm/Analysis/CycleAnalysis.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Dominators.h"
@@ -29,19 +30,6 @@ bool llvm::GenericUniformityAnalysisImpl<SSAContext>::markDefsDivergent(
   return markDivergent(cast<Value>(&Instr));
 }
 
-template <>
-bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
-    const Use &U) const {
-  const auto *V = U.get();
-  if (isDivergent(V))
-    return true;
-  if (const auto *DefInstr = dyn_cast<Instruction>(V)) {
-    const auto *UseInstr = cast<Instruction>(U.getUser());
-    return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
-  }
-  return false;
-}
-
 template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
   for (auto &I : instructions(F)) {
     InstructionUniformity IU = TTI->getInstructionUniformity(&I);
@@ -108,6 +96,31 @@ void llvm::GenericUniformityAnalysisImpl<
   }
 }
 
+template <>
+bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
+    const Use &U) const {
+  const auto *V = U.get();
+  if (isDivergent(V))
+    return true;
+  if (const auto *DefInstr = dyn_cast<Instruction>(V)) {
+    const auto *UseInstr = cast<Instruction>(U.getUser());
+    return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
+  }
+  return false;
+}
+
+template <>
+bool GenericUniformityAnalysisImpl<SSAContext>::isOperandUniform(
+    const Instruction &I, InstructionUniformity IU) const {
+  switch (IU) {
+  case InstructionUniformity::EitherOfFirstTwoOp:
+    return !isDivergentUse(I.getOperandUse(0)) ||
+           !isDivergentUse(I.getOperandUse(1));
+  default:
+    return false;
+  }
+}
+
 // This ensures explicit instantiation of
 // GenericUniformityAnalysisImpl::ImplDeleter::operator()
 template class llvm::GenericUniformityInfo<SSAContext>;
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index dbadb67e1e6d2..ffe878f82f207 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -151,6 +151,17 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
   return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
 }
 
+template <>
+bool GenericUniformityAnalysisImpl<MachineSSAContext>::isOperandUniform(
+    const MachineInstr &I, InstructionUniformity IU) const {
+  switch (IU) {
+  case InstructionUniformity::EitherOfFirstTwoOp:
+    return !isDivergentUse(I.getOperand(0)) || !isDivergentUse(I.getOperand(1));
+  default:
+    return false;
+  }
+}
+
 // This ensures explicit instantiation of
 // GenericUniformityAnalysisImpl::ImplDeleter::operator()
 template class llvm::GenericUniformityInfo<MachineSSAContext>;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 46cb8cc1312dc..9b91c7bee84bd 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -7,14 +7,14 @@ define amdgpu_kernel void @ds_swizzle(ptr addrspace(1) %out, i32 %src) #0 {
   ret void
 }
 
-; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+; CHECK: ALL VALUES UNIFORM
 define amdgpu_kernel void @v_permlane16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
   %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
   store i32 %v, ptr addrspace(1) %out
   ret void
 }
 
-; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+; CHECK: ALL VALUES UNIFORM
 define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
   %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
   store i32 %v, ptr addrspace(1) %out
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
new file mode 100644
index 0000000000000..37be465a7796b
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+
+; CHECK: ALL VALUES UNIFORM
+define amdgpu_kernel void @v_permlane16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+  %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: ALL VALUES UNIFORM
+define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+  %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK:  DIVERGENT:   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK:               %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK:               store i32 %v1, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @div_permlane16_var_uni_usr_x16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+  %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+  %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+  store i32 %v1, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK:  DIVERGENT:   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK:               %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK:               store i32 %v1, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @div_permlane16_var_uni_x16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+  %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+  %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+  store i32 %v1, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK:  DIVERGENT:   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK:               %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK:               store i32 %v1, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @div_permlane16_var_uni_usr_16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+  %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+  %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+  store i32 %v1, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK:  DIVERGENT:   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK:               %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK:               store i32 %v1, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @div_permlane16_var_uni_16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+  %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+  %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+  store i32 %v1, ptr addrspace(1) %out
+  ret void
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}

>From c9a6192d5e6a253cea7e049e4accb22947fdd658 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 4 Nov 2025 14:21:00 +0530
Subject: [PATCH 03/17] Update the enum name for clarity

---
 llvm/include/llvm/ADT/Uniformity.h             | 5 +++--
 llvm/lib/Analysis/UniformityAnalysis.cpp       | 2 +-
 llvm/lib/CodeGen/MachineUniformityAnalysis.cpp | 4 ++--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/ADT/Uniformity.h b/llvm/include/llvm/ADT/Uniformity.h
index 9571d43b8a9b9..ed558b004d322 100644
--- a/llvm/include/llvm/ADT/Uniformity.h
+++ b/llvm/include/llvm/ADT/Uniformity.h
@@ -25,8 +25,9 @@ enum class InstructionUniformity {
   /// The result values can never be assumed to be uniform.
   NeverUniform,
 
-  /// Result value can be uniform if either of first two operand are uniform.
-  EitherOfFirstTwoOp
+  /// Result value can be uniform if any of the first two use operand are
+  /// uniform.
+  AnyOfFirstTwoUseOp
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 9b7cb33d7f76a..494ba272aecf7 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -113,7 +113,7 @@ template <>
 bool GenericUniformityAnalysisImpl<SSAContext>::isOperandUniform(
     const Instruction &I, InstructionUniformity IU) const {
   switch (IU) {
-  case InstructionUniformity::EitherOfFirstTwoOp:
+  case InstructionUniformity::AnyOfFirstTwoUseOp:
     return !isDivergentUse(I.getOperandUse(0)) ||
            !isDivergentUse(I.getOperandUse(1));
   default:
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index ffe878f82f207..8f6652ee27f25 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -155,8 +155,8 @@ template <>
 bool GenericUniformityAnalysisImpl<MachineSSAContext>::isOperandUniform(
     const MachineInstr &I, InstructionUniformity IU) const {
   switch (IU) {
-  case InstructionUniformity::EitherOfFirstTwoOp:
-    return !isDivergentUse(I.getOperand(0)) || !isDivergentUse(I.getOperand(1));
+  case InstructionUniformity::AnyOfFirstTwoUseOp:
+    return !isDivergentUse(I.getOperand(1)) || !isDivergentUse(I.getOperand(2));
   default:
     return false;
   }

>From f59ff790dbc92c76097dee809567775ca502cbe9 Mon Sep 17 00:00:00 2001
From: Pankaj Dwivedi <divedi.pk.117 at gmail.com>
Date: Wed, 5 Nov 2025 10:53:47 +0530
Subject: [PATCH 04/17] Apply suggestion from @arsenm

Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 23a774bed21a9..da305ba283222 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -428,7 +428,7 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
 
   // Map containing tracked instruction that can be proven uniform based on its
   // operand Uniformity.
-  llvm::DenseMap<const InstructionT *, InstructionUniformity>
+  DenseMap<const InstructionT *, InstructionUniformity>
       UniformInstruction;
 
   /// \brief Mark \p Term as divergent and push all Instructions that become

>From e64e3a6ffaf443fd630fa96826049ba5eeddabf4 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Mon, 17 Nov 2025 15:17:36 +0530
Subject: [PATCH 05/17] Let the getInstructionUniformity hook wrap
 isSourceOfDivergence/isAlwaysUniform

---
 llvm/include/llvm/Analysis/TargetTransformInfo.h     |  3 ++-
 llvm/include/llvm/Analysis/TargetTransformInfoImpl.h |  7 +++++--
 llvm/lib/Analysis/TargetTransformInfo.cpp            |  4 ++--
 llvm/lib/CodeGen/MachineUniformityAnalysis.cpp       | 10 +++-------
 llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp |  2 ++
 llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h     |  1 -
 6 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 707c57e1160ba..437b47c5a59ef 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -2083,7 +2083,8 @@ class TargetTransformInfo {
   /// Returns true if GEP should not be used to index into vectors for this
   /// target.
   LLVM_ABI bool allowVectorElementIndexingUsingGEP() const;
-  InstructionUniformity getInstructionUniformity(const Instruction &I) const;
+
+  InstructionUniformity getInstructionUniformity(const Value *V) const;
 
 private:
   std::unique_ptr<const TargetTransformInfoImplBase> TTIImpl;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 94d6d9ecba35c..43d73c3fc72cb 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1227,8 +1227,11 @@ class TargetTransformInfoImplBase {
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {}
 
   virtual bool allowVectorElementIndexingUsingGEP() const { return true; }
-  virtual InstructionUniformity
-  getInstructionUniformity(const Instruction &I) const {
+
+  // New API for uniformity classification
+  // Targets should override this to provide target-specific uniformity analysis
+  // The default implementation returns Default (conservative behavior)
+  virtual InstructionUniformity getInstructionUniformity(const Value *V) const {
     return InstructionUniformity::Default;
   }
 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index bf26ff4079da5..a26742ea23a96 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1552,8 +1552,8 @@ bool TargetTransformInfo::allowVectorElementIndexingUsingGEP() const {
 }
 
 InstructionUniformity
-TargetTransformInfo::getInstructionUniformity(const Instruction &I) const {
-  return TTIImpl->getInstructionUniformity(I);
+TargetTransformInfo::getInstructionUniformity(const Value *V) const {
+  return TTIImpl->getInstructionUniformity(V);
 }
 
 TargetTransformInfoImplBase::~TargetTransformInfoImplBase() = default;
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 8f6652ee27f25..782bb11fe0cb7 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -151,15 +151,11 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
   return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
 }
 
+// This can be defined later depending on use of the MachineUniformityAnalysis.
 template <>
 bool GenericUniformityAnalysisImpl<MachineSSAContext>::isOperandUniform(
-    const MachineInstr &I, InstructionUniformity IU) const {
-  switch (IU) {
-  case InstructionUniformity::AnyOfFirstTwoUseOp:
-    return !isDivergentUse(I.getOperand(1)) || !isDivergentUse(I.getOperand(2));
-  default:
-    return false;
-  }
+    const MachineInstr &MI, InstructionUniformity IU) const {
+  return false;
 }
 
 // This ensures explicit instantiation of
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index e81ce4aaf6fe9..d3f01046faa2a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1727,6 +1727,8 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
   return BaseT::getNumberOfParts(Tp);
 }
 
+// New API that wraps the old isSourceOfDivergence and isAlwaysUniform APIs
+// with additional support for new uniformity classifications
 InstructionUniformity
 GCNTTIImpl::getInstructionUniformity(const Value *V) const {
   if (isAlwaysUniform(V))
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 899249db54574..3f59684b00b44 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -221,7 +221,6 @@ class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {
     // Self-referential globals are not supported.
     return false;
   }
-
   InstructionUniformity getInstructionUniformity(const Value *V) const override;
 };
 

>From 12ceaf7c935ddfd608855d0196a7b881106229f5 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 19 Nov 2025 17:38:58 +0530
Subject: [PATCH 06/17] Update the operand check and the machine instruction
 uniformity handling

---
 llvm/lib/Analysis/UniformityAnalysis.cpp      |  7 +-
 .../lib/CodeGen/MachineUniformityAnalysis.cpp | 35 +++++++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  7 ++
 .../AMDGPU/MIR/uniform-permlane.mir           | 86 +++++++++++++++++++
 .../AMDGPU/uniform_intrinsic.ll               |  8 +-
 5 files changed, 135 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir

diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 494ba272aecf7..9a02b65f97bc5 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -114,8 +114,11 @@ bool GenericUniformityAnalysisImpl<SSAContext>::isOperandUniform(
     const Instruction &I, InstructionUniformity IU) const {
   switch (IU) {
   case InstructionUniformity::AnyOfFirstTwoUseOp:
-    return !isDivergentUse(I.getOperandUse(0)) ||
-           !isDivergentUse(I.getOperandUse(1));
+    // For permlane16/permlanex16: <old> <src0> <src1> <src2> <fi>
+    // <bound_control> Check if either src0 (operand 1) or src1 (operand 2 -
+    // lane select) is uniform
+    return !isDivergentUse(I.getOperandUse(1)) ||
+           !isDivergentUse(I.getOperandUse(2));
   default:
     return false;
   }
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 782bb11fe0cb7..1cbd9450f4e99 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -151,11 +151,42 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
   return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
 }
 
-// This can be defined later depending on use of the MachineUniformityAnalysis.
 template <>
 bool GenericUniformityAnalysisImpl<MachineSSAContext>::isOperandUniform(
     const MachineInstr &MI, InstructionUniformity IU) const {
-  return false;
+  switch (IU) {
+  // For permlane16/permlanex16, check if either src or lane select is uniform
+  // These instructions have mixed immediate and register operands:
+  // Operand 1 is src0 (the source value to permute)
+  // Operand 3 is src1 (lane select - which lane within the 16 to read from)
+  // Result is uniform if EITHER the source OR lane select is uniform
+  case InstructionUniformity::AnyOfFirstTwoUseOp: {
+    // Check if any of the first two register use operands is uniform
+    // Result is uniform if ANY of these operands is uniform
+    const MachineOperand *FirstRegOp = nullptr;
+    const MachineOperand *SecondRegOp = nullptr;
+
+    // Find the first two register use operands
+    for (const MachineOperand &MO : MI.uses()) {
+      if (MO.isReg() && MO.getReg().isVirtual()) {
+        if (!FirstRegOp)
+          FirstRegOp = &MO;
+        else if (!SecondRegOp) {
+          SecondRegOp = &MO;
+          break;
+        }
+      }
+    }
+
+    if (!FirstRegOp || !SecondRegOp)
+      return false;
+
+    // Return true if either operand is uniform
+    return !isDivergentUse(*FirstRegOp) || !isDivergentUse(*SecondRegOp);
+  }
+  default:
+    return false;
+  }
 }
 
 // This ensures explicit instantiation of
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5086c553da101..6eea224babdbe 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10803,6 +10803,13 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
     return InstructionUniformity::NeverUniform;
 
   unsigned opcode = MI.getOpcode();
+
+  // Special handling for permlane16/permlanex16 - uniformity depends on
+  // operands
+  if (opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
+      opcode == AMDGPU::V_PERMLANEX16_B32_e64)
+    return InstructionUniformity::AnyOfFirstTwoUseOp;
+
   if (opcode == AMDGPU::V_READLANE_B32 ||
       opcode == AMDGPU::V_READFIRSTLANE_B32 ||
       opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
new file mode 100644
index 0000000000000..f08d16affef23
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
@@ -0,0 +1,86 @@
+# RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
+
+# Test the machine-level uniformity analysis for permlane16/permlanex16 instructions.
+#
+# NOTE: Permlane instructions have a hardware constraint that src1 (lane select) and src2
+# must be SGPR (scalar) registers. Since SGPRs are always uniform at machine level, 
+# permlane results are always uniform according to the AnyOfFirstTwoUseOp logic
+# (either src0 OR src1 being uniform makes the result uniform, and src1 is always uniform).
+#
+# These tests verify that the uniformity analysis correctly handles permlane instructions
+# and that uniform results propagate through chains of operations.
+
+---
+# Test: permlane16 with divergent VGPR src and uniform SGPR lane select
+# Result is UNIFORM because lane select (SGPR) is always uniform
+name: permlane16_basic
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: permlane16_basic
+    ; CHECK: ALL VALUES UNIFORM
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:sreg_32 = S_MOV_B32 5
+    %2:sreg_32 = IMPLICIT_DEF
+    %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
+    S_ENDPGM 0
+
+...
+---
+# Test: permlanex16 with divergent VGPR src and uniform SGPR lane select
+# Result is UNIFORM because lane select (SGPR) is always uniform
+name: permlanex16_basic
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: permlanex16_basic
+    ; CHECK: ALL VALUES UNIFORM
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:sreg_32 = S_MOV_B32 7
+    %2:sreg_32 = IMPLICIT_DEF
+    %3:vgpr_32 = V_PERMLANEX16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
+    S_ENDPGM 0
+
+...
+---
+# Test: Chain of permlane operations - uniformity propagates
+# Both permlanes are uniform, second uses result of first as source
+name: permlane16_chain_uniform
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: permlane16_chain_uniform
+    ; CHECK: ALL VALUES UNIFORM
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:sreg_32 = S_MOV_B32 3
+    %2:sreg_32 = IMPLICIT_DEF
+    ; First permlane - uniform because lane select is SGPR
+    %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
+    ; Second permlane uses uniform result - also uniform
+    %4:vgpr_32 = V_PERMLANEX16_B32_e64 0, %3, 0, %1, 0, %2, %3, 0, implicit $exec
+    S_ENDPGM 0
+
+...
+---
+# Test: Multiple permlane operations in sequence
+# Verifies that uniformity is correctly tracked through complex chains
+name: permlane_multiple
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: MachineUniformityInfo for function: permlane_multiple
+    ; CHECK: ALL VALUES UNIFORM
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:sreg_32 = S_MOV_B32 1
+    %2:sreg_32 = S_MOV_B32 2  
+    %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
+    %4:vgpr_32 = V_PERMLANEX16_B32_e64 0, %3, 0, %1, 0, %2, %3, 0, implicit $exec
+    %5:vgpr_32 = V_PERMLANE16_B32_e64 0, %4, 0, %2, 0, %1, %4, 0, implicit $exec
+    S_ENDPGM 0
+
+...
+
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
index 37be465a7796b..e7391ee0c265b 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
@@ -16,11 +16,11 @@ define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i
 }
 
 ; CHECK:  DIVERGENT:   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK:               %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK:               %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false)
 ; CHECK:               store i32 %v1, ptr addrspace(1) %out, align 4
 define amdgpu_kernel void @div_permlane16_var_uni_usr_x16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
-  %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+  %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false) #0
   store i32 %v1, ptr addrspace(1) %out
   ret void
 }
@@ -36,11 +36,11 @@ define amdgpu_kernel void @div_permlane16_var_uni_x16(ptr addrspace(1) %out, i32
 }
 
 ; CHECK:  DIVERGENT:   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK:               %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+; CHECK:               %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false)
 ; CHECK:               store i32 %v1, ptr addrspace(1) %out, align 4
 define amdgpu_kernel void @div_permlane16_var_uni_usr_16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
-  %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %v, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+  %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false) #0
   store i32 %v1, ptr addrspace(1) %out
   ret void
 }

>From cf4dcef9b39841bc53d0a5b3f7774a916a902bc5 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 19 Nov 2025 18:59:57 +0530
Subject: [PATCH 07/17] Fix formatting

---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index da305ba283222..d06a8d3f28716 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -1,4 +1,4 @@
-//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===//
+//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -428,8 +428,7 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
 
   // Map containing tracked instruction that can be proven uniform based on its
   // operand Uniformity.
-  DenseMap<const InstructionT *, InstructionUniformity>
-      UniformInstruction;
+  DenseMap<const InstructionT *, InstructionUniformity> UniformInstruction;
 
   /// \brief Mark \p Term as divergent and push all Instructions that become
   /// divergent as a result on the worklist.

>From 96fe05eea124cdb6492be22d1df51457a9a10a26 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 19 Nov 2025 19:50:03 +0530
Subject: [PATCH 08/17] update mir test check

---
 .../UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir    | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
index f08d16affef23..da6048d86b2dd 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
@@ -18,7 +18,7 @@ machineFunctionInfo:
   isEntryFunction: true
 body: |
   bb.0:
-    ; CHECK-LABEL: MachineUniformityInfo for function: permlane16_basic
+    ; CHECK-LABEL: MachineUniformityInfo for function: @permlane16_basic
     ; CHECK: ALL VALUES UNIFORM
     %0:vgpr_32 = IMPLICIT_DEF
     %1:sreg_32 = S_MOV_B32 5
@@ -35,7 +35,7 @@ machineFunctionInfo:
   isEntryFunction: true
 body: |
   bb.0:
-    ; CHECK-LABEL: MachineUniformityInfo for function: permlanex16_basic
+    ; CHECK-LABEL: MachineUniformityInfo for function: @permlanex16_basic
     ; CHECK: ALL VALUES UNIFORM
     %0:vgpr_32 = IMPLICIT_DEF
     %1:sreg_32 = S_MOV_B32 7
@@ -52,7 +52,7 @@ machineFunctionInfo:
   isEntryFunction: true
 body: |
   bb.0:
-    ; CHECK-LABEL: MachineUniformityInfo for function: permlane16_chain_uniform
+    ; CHECK-LABEL: MachineUniformityInfo for function: @permlane16_chain_uniform
     ; CHECK: ALL VALUES UNIFORM
     %0:vgpr_32 = IMPLICIT_DEF
     %1:sreg_32 = S_MOV_B32 3
@@ -72,7 +72,7 @@ machineFunctionInfo:
   isEntryFunction: true
 body: |
   bb.0:
-    ; CHECK-LABEL: MachineUniformityInfo for function: permlane_multiple
+    ; CHECK-LABEL: MachineUniformityInfo for function: @permlane_multiple
     ; CHECK: ALL VALUES UNIFORM
     %0:vgpr_32 = IMPLICIT_DEF
     %1:sreg_32 = S_MOV_B32 1

>From 41eea7b41f985489dc14a0c881f575abfde42250 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Thu, 20 Nov 2025 16:30:26 +0530
Subject: [PATCH 09/17] separate complex target-based custom logic through
 target hook

---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 15 ++++--
 llvm/include/llvm/ADT/Uniformity.h            |  7 +--
 .../llvm/Analysis/TargetTransformInfo.h       | 12 +++++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  8 +++
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   | 16 ++++++
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  5 ++
 llvm/lib/Analysis/UniformityAnalysis.cpp      | 29 +++++-----
 .../lib/CodeGen/MachineUniformityAnalysis.cpp | 53 ++++++++-----------
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      | 38 ++++++++++++-
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h |  3 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 23 ++++++--
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  3 ++
 12 files changed, 154 insertions(+), 58 deletions(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index d06a8d3f28716..c76964f9d7571 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -408,9 +408,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   void recordTemporalDivergence(ConstValueRefT, const InstructionT *,
                                 const CycleT *);
 
-  bool isOperandUniform(const InstructionT &I, InstructionUniformity IU) const;
+  /// Check if an instruction with Custom uniformity can be proven uniform
+  /// based on its operands. This queries the target-specific callback.
+  bool isCustomUniform(const InstructionT &I) const;
 
-  /// \brief keep track of target instruction that can be proven uniform.
+  /// \brief keep track of instructions that require custom uniformity analysis.
   void addUniformInstruction(const InstructionT *I, InstructionUniformity IU);
 
 protected:
@@ -793,10 +795,13 @@ void GenericUniformityAnalysisImpl<ContextT>::markDivergent(
     const InstructionT &I) {
   if (isAlwaysUniform(I))
     return;
+  // Check if instruction requires custom uniformity analysis
   auto It = UniformInstruction.find(&I);
-  if (It != UniformInstruction.end() && isOperandUniform(I, It->second)) {
-    addUniformOverride(I);
-    return;
+  if (It != UniformInstruction.end()) {
+    if (It->second == InstructionUniformity::Custom && isCustomUniform(I)) {
+      addUniformOverride(I);
+      return;
+    }
   }
   bool Marked = false;
   if (I.isTerminator()) {
diff --git a/llvm/include/llvm/ADT/Uniformity.h b/llvm/include/llvm/ADT/Uniformity.h
index ed558b004d322..43e588745f73f 100644
--- a/llvm/include/llvm/ADT/Uniformity.h
+++ b/llvm/include/llvm/ADT/Uniformity.h
@@ -25,9 +25,10 @@ enum class InstructionUniformity {
   /// The result values can never be assumed to be uniform.
   NeverUniform,
 
-  /// Result value can be uniform if any of the first two use operand are
-  /// uniform.
-  AnyOfFirstTwoUseOp
+  /// If all operands are uniform, the result values are uniform. Otherwise,
+  /// the result values may be divergent, and a custom check may be used to
+  /// determine uniformity via a callback.
+  Custom
 };
 
 } // namespace llvm
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 437b47c5a59ef..bbead5aab4bab 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -2086,6 +2086,18 @@ class TargetTransformInfo {
 
   InstructionUniformity getInstructionUniformity(const Value *V) const;
 
+  /// Determine if an instruction with some operands uniform can be proven
+  /// uniform. This is used for custom uniformity analysis where the target
+  /// can define complex rules that depend on which specific operands are
+  /// uniform.
+  ///
+  /// \param I The instruction to check.
+  /// \param UniformArgs A bitvector indicating which operands are known to be
+  ///                    uniform (bit N corresponds to operand N).
+  /// \returns true if the instruction result can be proven uniform given the
+  ///          uniform operands, false otherwise.
+  bool isUniform(const Instruction *I, const SmallBitVector &UniformArgs) const;
+
 private:
   std::unique_ptr<const TargetTransformInfoImplBase> TTIImpl;
 };
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 43d73c3fc72cb..24032caab5ac1 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1235,6 +1235,14 @@ class TargetTransformInfoImplBase {
     return InstructionUniformity::Default;
   }
 
+  // Custom uniformity check for instructions marked as Custom
+  // Override this to provide complex uniformity rules based on which operands
+  // are uniform
+  virtual bool isUniform(const Instruction *I,
+                         const SmallBitVector &UniformArgs) const {
+    return false; // Conservative: assume divergent
+  }
+
 protected:
   // Obtain the minimum required size to hold the value (without the sign)
   // In case of a vector it returns the min required size for one element.
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 77f710203d1fc..67c3ac5f4c40e 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2367,6 +2367,22 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
     return InstructionUniformity::Default;
   }
 
+  /// Determine if a machine instruction with some operands uniform can be
+  /// proven uniform. This is used for custom uniformity analysis where the
+  /// target can define complex rules that depend on which specific operands
+  /// are uniform.
+  ///
+  /// \param MI The machine instruction to check.
+  /// \param UniformArgs A bitvector indicating which register operands are
+  ///                    known to be uniform (bit N corresponds to the Nth
+  ///                    register use operand).
+  /// \returns true if the instruction result can be proven uniform given the
+  ///          uniform operands, false otherwise.
+  virtual bool isUniform(const MachineInstr &MI,
+                         const SmallBitVector &UniformArgs) const {
+    return false; // Conservative: assume divergent
+  }
+
   /// Returns true if the given \p MI defines a TargetIndex operand that can be
   /// tracked by their offset, can have values, and can have debug info
   /// associated with it. If so, sets \p Index and \p Offset of the target index
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index a26742ea23a96..ce2063fbb8008 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1556,6 +1556,11 @@ TargetTransformInfo::getInstructionUniformity(const Value *V) const {
   return TTIImpl->getInstructionUniformity(V);
 }
 
+bool TargetTransformInfo::isUniform(const Instruction *I,
+                                    const SmallBitVector &UniformArgs) const {
+  return TTIImpl->isUniform(I, UniformArgs);
+}
+
 TargetTransformInfoImplBase::~TargetTransformInfoImplBase() = default;
 
 TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 9a02b65f97bc5..663faf88e915f 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -8,6 +8,7 @@
 
 #include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/ADT/GenericUniformityImpl.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/Uniformity.h"
 #include "llvm/Analysis/CycleAnalysis.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -36,10 +37,14 @@ template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
     switch (IU) {
     case InstructionUniformity::AlwaysUniform:
       addUniformOverride(I);
-      continue;
+      break;
     case InstructionUniformity::NeverUniform:
       markDivergent(I);
-      continue;
+      break;
+    case InstructionUniformity::Custom:
+      // Instructions requiring custom uniformity analysis based on operands
+      addUniformInstruction(&I, IU);
+      break;
     case InstructionUniformity::Default:
       break;
     }
@@ -110,18 +115,16 @@ bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
 }
 
 template <>
-bool GenericUniformityAnalysisImpl<SSAContext>::isOperandUniform(
-    const Instruction &I, InstructionUniformity IU) const {
-  switch (IU) {
-  case InstructionUniformity::AnyOfFirstTwoUseOp:
-    // For permlane16/permlanex16: <old> <src0> <src1> <src2> <fi>
-    // <bound_control> Check if either src0 (operand 1) or src1 (operand 2 -
-    // lane select) is uniform
-    return !isDivergentUse(I.getOperandUse(1)) ||
-           !isDivergentUse(I.getOperandUse(2));
-  default:
-    return false;
+bool GenericUniformityAnalysisImpl<SSAContext>::isCustomUniform(
+    const Instruction &I) const {
+  // Build bitvector of uniform operands
+  SmallBitVector UniformArgs(I.getNumOperands());
+  for (unsigned OpIdx = 0; OpIdx < I.getNumOperands(); ++OpIdx) {
+    UniformArgs[OpIdx] = !isDivergentUse(I.getOperandUse(OpIdx));
   }
+
+  // Query target-specific uniformity callback
+  return TTI->isUniform(&I, UniformArgs);
 }
 
 // This ensures explicit instantiation of
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 1cbd9450f4e99..4e43287dc9424 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -8,6 +8,7 @@
 
 #include "llvm/CodeGen/MachineUniformityAnalysis.h"
 #include "llvm/ADT/GenericUniformityImpl.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/MachineCycleAnalysis.h"
 #include "llvm/CodeGen/MachineDominators.h"
@@ -61,6 +62,10 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
       case InstructionUniformity::NeverUniform:
         markDivergent(instr);
         break;
+      case InstructionUniformity::Custom:
+        // Instructions requiring custom uniformity analysis based on operands
+        addUniformInstruction(&instr, uniformity);
+        break;
       case InstructionUniformity::Default:
         break;
       }
@@ -152,41 +157,25 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
 }
 
 template <>
-bool GenericUniformityAnalysisImpl<MachineSSAContext>::isOperandUniform(
-    const MachineInstr &MI, InstructionUniformity IU) const {
-  switch (IU) {
-  // For permlane16/permlanex16, check if either src or lane select is uniform
-  // These instructions have mixed immediate and register operands:
-  // Operand 1 is src0 (the source value to permute)
-  // Operand 3 is src1 (lane select - which lane within the 16 to read from)
-  // Result is uniform if EITHER the source OR lane select is uniform
-  case InstructionUniformity::AnyOfFirstTwoUseOp: {
-    // Check if any of the first two register use operands is uniform
-    // Result is uniform if ANY of these operands is uniform
-    const MachineOperand *FirstRegOp = nullptr;
-    const MachineOperand *SecondRegOp = nullptr;
-
-    // Find the first two register use operands
-    for (const MachineOperand &MO : MI.uses()) {
-      if (MO.isReg() && MO.getReg().isVirtual()) {
-        if (!FirstRegOp)
-          FirstRegOp = &MO;
-        else if (!SecondRegOp) {
-          SecondRegOp = &MO;
-          break;
-        }
-      }
-    }
-
-    if (!FirstRegOp || !SecondRegOp)
-      return false;
+bool GenericUniformityAnalysisImpl<MachineSSAContext>::isCustomUniform(
+    const MachineInstr &MI) const {
+  const auto &InstrInfo = *F.getSubtarget().getInstrInfo();
 
-    // Return true if either operand is uniform
-    return !isDivergentUse(*FirstRegOp) || !isDivergentUse(*SecondRegOp);
+  // Build bitvector of uniform register use operands
+  SmallVector<const MachineOperand *, 4> RegUseOps;
+  for (const MachineOperand &MO : MI.uses()) {
+    if (MO.isReg() && MO.getReg().isVirtual()) {
+      RegUseOps.push_back(&MO);
+    }
   }
-  default:
-    return false;
+
+  SmallBitVector UniformArgs(RegUseOps.size());
+  for (unsigned i = 0; i < RegUseOps.size(); ++i) {
+    UniformArgs[i] = !isDivergentUse(*RegUseOps[i]);
   }
+
+  // Query target-specific uniformity callback
+  return InstrInfo.isUniform(MI, UniformArgs);
 }
 
 // This ensures explicit instantiation of
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index d3f01046faa2a..2f50a4a22927a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -19,6 +19,7 @@
 #include "AMDGPUTargetMachine.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIModeRegisterDefaults.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -1727,10 +1728,18 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
   return BaseT::getNumberOfParts(Tp);
 }
 
-// New API that wraps the old isSourceOfDivergence and isAlwaysUniform APIs
-// with additional support for new uniformity classifications
 InstructionUniformity
 GCNTTIImpl::getInstructionUniformity(const Value *V) const {
+  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
+    switch (Intrinsic->getIntrinsicID()) {
+    case Intrinsic::amdgcn_permlane16:
+    case Intrinsic::amdgcn_permlanex16:
+      return InstructionUniformity::Custom;
+    default:
+      break;
+    }
+  }
+
   if (isAlwaysUniform(V))
     return InstructionUniformity::AlwaysUniform;
 
@@ -1786,3 +1795,28 @@ bool GCNTTIImpl::shouldDropLSRSolutionIfLessProfitable() const {
   // Prefer the baseline when LSR cannot clearly reduce per-iteration work.
   return true;
 }
+
+bool GCNTTIImpl::isUniform(const Instruction *I,
+                           const SmallBitVector &UniformArgs) const {
+  // Custom uniformity check for permlane16/permlanex16
+  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(I)) {
+    switch (Intrinsic->getIntrinsicID()) {
+    case Intrinsic::amdgcn_permlane16:
+    case Intrinsic::amdgcn_permlanex16:
+      // For permlane16/permlanex16:
+      // Operand 0: old value (ignored for uniformity)
+      // Operand 1: src0 (source value to permute)
+      // Operand 2: src1 (lane select within 16-lane group)
+      // Operand 3: src2 (which 16-lane group)
+      // Result is uniform if either src0 (op 1) or src1 (op 2) is uniform
+      if (UniformArgs.size() > 2) {
+        return UniformArgs[1] || UniformArgs[2];
+      }
+      return false;
+    default:
+      break;
+    }
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index dc7d01533da02..ea2bf72836199 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -321,6 +321,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
                      const TTI::LSRCost &B) const override;
   bool isNumRegsMajorCostOfLSR() const override;
   bool shouldDropLSRSolutionIfLessProfitable() const override;
+
+  bool isUniform(const Instruction *I,
+                 const SmallBitVector &UniformArgs) const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6eea224babdbe..b15d57d346669 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -20,6 +20,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/LiveIntervals.h"
@@ -10804,11 +10805,10 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
 
   unsigned opcode = MI.getOpcode();
 
-  // Special handling for permlane16/permlanex16 - uniformity depends on
-  // operands
+  // permlane16/permlanex16 require custom uniformity analysis
   if (opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
       opcode == AMDGPU::V_PERMLANEX16_B32_e64)
-    return InstructionUniformity::AnyOfFirstTwoUseOp;
+    return InstructionUniformity::Custom;
 
   if (opcode == AMDGPU::V_READLANE_B32 ||
       opcode == AMDGPU::V_READFIRSTLANE_B32 ||
@@ -10887,6 +10887,23 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
 
   return InstructionUniformity::Default;
 }
+bool SIInstrInfo::isUniform(const MachineInstr &MI,
+                            const SmallBitVector &UniformArgs) const {
+  unsigned opcode = MI.getOpcode();
+
+  // Custom uniformity check for permlane16/permlanex16
+  if (opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
+      opcode == AMDGPU::V_PERMLANEX16_B32_e64) {
+    // Result is uniform if either src0 or src1 is uniform
+    // UniformArgs[0] = src0 (source value)
+    // UniformArgs[1] = src1 (lane select)
+    if (UniformArgs.size() >= 2) {
+      return UniformArgs[0] || UniformArgs[1];
+    }
+  }
+
+  return false;
+}
 
 unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
   switch (MF.getFunction().getCallingConv()) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 93d28d22bfd16..429982f75f29d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1680,6 +1680,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   InstructionUniformity
   getInstructionUniformity(const MachineInstr &MI) const final;
 
+  bool isUniform(const MachineInstr &MI,
+                 const SmallBitVector &UniformArgs) const final;
+
   InstructionUniformity
   getGenericInstructionUniformity(const MachineInstr &MI) const;
 

>From 1865da335920214d7046702d64a6ac48bd294a6a Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 16 Dec 2025 18:28:29 +0530
Subject: [PATCH 10/17] refactor: rebased with latest changes

---
 llvm/include/llvm/Analysis/TargetTransformInfo.h     | 2 --
 llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 7 -------
 llvm/lib/Analysis/TargetTransformInfo.cpp            | 5 -----
 llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 1 -
 4 files changed, 15 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index bbead5aab4bab..b6dc19a873ec4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -2084,8 +2084,6 @@ class TargetTransformInfo {
   /// target.
   LLVM_ABI bool allowVectorElementIndexingUsingGEP() const;
 
-  InstructionUniformity getInstructionUniformity(const Value *V) const;
-
   /// Determine if an instruction with some operands uniform can be proven
   /// uniform. This is used for custom uniformity analysis where the target
   /// can define complex rules that depend on which specific operands are
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 24032caab5ac1..246c05709c4d8 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1228,13 +1228,6 @@ class TargetTransformInfoImplBase {
 
   virtual bool allowVectorElementIndexingUsingGEP() const { return true; }
 
-  // New API for uniformity classification
-  // Targets should override this to provide target-specific uniformity analysis
-  // The default implementation returns Default (conservative behavior)
-  virtual InstructionUniformity getInstructionUniformity(const Value *V) const {
-    return InstructionUniformity::Default;
-  }
-
   // Custom uniformity check for instructions marked as Custom
   // Override this to provide complex uniformity rules based on which operands
   // are uniform
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index ce2063fbb8008..9345595555c2d 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1551,11 +1551,6 @@ bool TargetTransformInfo::allowVectorElementIndexingUsingGEP() const {
   return TTIImpl->allowVectorElementIndexingUsingGEP();
 }
 
-InstructionUniformity
-TargetTransformInfo::getInstructionUniformity(const Value *V) const {
-  return TTIImpl->getInstructionUniformity(V);
-}
-
 bool TargetTransformInfo::isUniform(const Instruction *I,
                                     const SmallBitVector &UniformArgs) const {
   return TTIImpl->isUniform(I, UniformArgs);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 2f50a4a22927a..79842504b19ee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1817,6 +1817,5 @@ bool GCNTTIImpl::isUniform(const Instruction *I,
       break;
     }
   }
-
   return false;
 }

>From f2f8b5ad236a05379d05208a67d41170500ac025 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 16 Dec 2025 18:48:14 +0530
Subject: [PATCH 11/17] refactor

---
 llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 3f59684b00b44..899249db54574 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -221,6 +221,7 @@ class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {
     // Self-referential globals are not supported.
     return false;
   }
+
   InstructionUniformity getInstructionUniformity(const Value *V) const override;
 };
 

>From 2bbcf97a220e7be3e049cf47c52e000833886e03 Mon Sep 17 00:00:00 2001
From: Pankaj Dwivedi <divedi.pk.117 at gmail.com>
Date: Tue, 16 Dec 2025 19:06:39 +0530
Subject: [PATCH 12/17] Update llvm/lib/Analysis/UniformityAnalysis.cpp

Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
 llvm/lib/Analysis/UniformityAnalysis.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 663faf88e915f..7866aec824dbf 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -119,7 +119,7 @@ bool GenericUniformityAnalysisImpl<SSAContext>::isCustomUniform(
     const Instruction &I) const {
   // Build bitvector of uniform operands
   SmallBitVector UniformArgs(I.getNumOperands());
-  for (unsigned OpIdx = 0; OpIdx < I.getNumOperands(); ++OpIdx) {
+  for (unsigned OpIdx = 0, E =UniformArgs.size(); OpIdx != E; ++OpIdx) {
     UniformArgs[OpIdx] = !isDivergentUse(I.getOperandUse(OpIdx));
   }
 

>From f81169c80b9e160b6f83a3021bc94bc14d91712c Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 16 Dec 2025 21:10:54 +0530
Subject: [PATCH 13/17] [Review] address changes

---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 23 +++++++++----------
 llvm/lib/Analysis/UniformityAnalysis.cpp      |  5 ++--
 .../lib/CodeGen/MachineUniformityAnalysis.cpp | 20 +++++++---------
 3 files changed, 21 insertions(+), 27 deletions(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index c76964f9d7571..67d8bf09ecaf6 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -1,4 +1,4 @@
-//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===//
+//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -412,8 +412,8 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   /// based on its operands. This queries the target-specific callback.
   bool isCustomUniform(const InstructionT &I) const;
 
-  /// \brief keep track of instructions that require custom uniformity analysis.
-  void addUniformInstruction(const InstructionT *I, InstructionUniformity IU);
+  /// \brief Add an instruction that requires custom uniformity analysis.
+  void addCustomUniformCandidate(const InstructionT *I);
 
 protected:
   const ContextT &Context;
@@ -428,9 +428,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   // Internal worklist for divergence propagation.
   std::vector<const InstructionT *> Worklist;
 
-  // Map containing tracked instruction that can be proven uniform based on its
-  // operand Uniformity.
-  DenseMap<const InstructionT *, InstructionUniformity> UniformInstruction;
+  // Set of instructions that require custom uniformity analysis based on
+  // operand uniformity.
+  SmallPtrSet<const InstructionT *, 8> CustomUniformCandidates;
 
   /// \brief Mark \p Term as divergent and push all Instructions that become
   /// divergent as a result on the worklist.
@@ -796,9 +796,8 @@ void GenericUniformityAnalysisImpl<ContextT>::markDivergent(
   if (isAlwaysUniform(I))
     return;
   // Check if instruction requires custom uniformity analysis
-  auto It = UniformInstruction.find(&I);
-  if (It != UniformInstruction.end()) {
-    if (It->second == InstructionUniformity::Custom && isCustomUniform(I)) {
+  if (CustomUniformCandidates.count(&I)) {
+    if (isCustomUniform(I)) {
       addUniformOverride(I);
       return;
     }
@@ -835,9 +834,9 @@ void GenericUniformityAnalysisImpl<ContextT>::addUniformOverride(
 }
 
 template <typename ContextT>
-void GenericUniformityAnalysisImpl<ContextT>::addUniformInstruction(
-    const InstructionT *I, InstructionUniformity IU) {
-  UniformInstruction[I] = IU;
+void GenericUniformityAnalysisImpl<ContextT>::addCustomUniformCandidate(
+    const InstructionT *I) {
+  CustomUniformCandidates.insert(I);
 }
 
 // Mark as divergent all external uses of values defined in \p DefCycle.
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 7866aec824dbf..973899c7a161c 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -43,7 +43,7 @@ template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
       break;
     case InstructionUniformity::Custom:
       // Instructions requiring custom uniformity analysis based on operands
-      addUniformInstruction(&I, IU);
+      addCustomUniformCandidate(&I);
       break;
     case InstructionUniformity::Default:
       break;
@@ -119,10 +119,9 @@ bool GenericUniformityAnalysisImpl<SSAContext>::isCustomUniform(
     const Instruction &I) const {
   // Build bitvector of uniform operands
   SmallBitVector UniformArgs(I.getNumOperands());
-  for (unsigned OpIdx = 0, E =UniformArgs.size(); OpIdx != E; ++OpIdx) {
+  for (unsigned OpIdx = 0, E = UniformArgs.size(); OpIdx != E; ++OpIdx) {
     UniformArgs[OpIdx] = !isDivergentUse(I.getOperandUse(OpIdx));
   }
-
   // Query target-specific uniformity callback
   return TTI->isUniform(&I, UniformArgs);
 }
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 4e43287dc9424..5ee1e5ca8f46c 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -64,7 +64,7 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
         break;
       case InstructionUniformity::Custom:
         // Instructions requiring custom uniformity analysis based on operands
-        addUniformInstruction(&instr, uniformity);
+        addCustomUniformCandidate(&instr);
         break;
       case InstructionUniformity::Default:
         break;
@@ -161,17 +161,13 @@ bool GenericUniformityAnalysisImpl<MachineSSAContext>::isCustomUniform(
     const MachineInstr &MI) const {
   const auto &InstrInfo = *F.getSubtarget().getInstrInfo();
 
-  // Build bitvector of uniform register use operands
-  SmallVector<const MachineOperand *, 4> RegUseOps;
-  for (const MachineOperand &MO : MI.uses()) {
-    if (MO.isReg() && MO.getReg().isVirtual()) {
-      RegUseOps.push_back(&MO);
-    }
-  }
-
-  SmallBitVector UniformArgs(RegUseOps.size());
-  for (unsigned i = 0; i < RegUseOps.size(); ++i) {
-    UniformArgs[i] = !isDivergentUse(*RegUseOps[i]);
+  // Build bitvector of uniform operands
+  SmallBitVector UniformArgs(MI.getNumOperands());
+  for (unsigned OpIdx = 0, E = MI.getNumOperands(); OpIdx != E; ++OpIdx) {
+    const MachineOperand &MO = MI.getOperand(OpIdx);
+    // Register operands: check if divergent
+    // Non-register operands (immediates, etc.): always uniform
+    UniformArgs[OpIdx] = !MO.isReg() || !isDivergentUse(MO);
   }
 
   // Query target-specific uniformity callback

>From f29a21e9b1b24d788278619b491dc142bf24e557 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Wed, 17 Dec 2025 13:38:55 +0530
Subject: [PATCH 14/17] Change the approach from identifying uniform
 instructions to identifying divergent ones

---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 31 ++++++++++---------
 .../llvm/Analysis/TargetTransformInfo.h       | 17 +++++-----
 .../llvm/Analysis/TargetTransformInfoImpl.h   | 10 +++---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   | 21 ++++++-------
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  6 ++--
 llvm/lib/Analysis/UniformityAnalysis.cpp      | 18 +++++------
 .../lib/CodeGen/MachineUniformityAnalysis.cpp | 19 ++++++------
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      | 14 ++++-----
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h |  4 +--
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 16 +++++-----
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  4 +--
 11 files changed, 79 insertions(+), 81 deletions(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 67d8bf09ecaf6..beeb5ad86608e 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -408,12 +408,12 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   void recordTemporalDivergence(ConstValueRefT, const InstructionT *,
                                 const CycleT *);
 
-  /// Check if an instruction with Custom uniformity can be proven uniform
+  /// Check if an instruction with Custom uniformity can be proven divergent
   /// based on its operands. This queries the target-specific callback.
-  bool isCustomUniform(const InstructionT &I) const;
+  bool isCustomDivergent(const InstructionT &I) const;
 
-  /// \brief Add an instruction that requires custom uniformity analysis.
-  void addCustomUniformCandidate(const InstructionT *I);
+  /// \brief Add an instruction that requires custom divergence analysis.
+  void addCustomDivergenceCandidate(const InstructionT *I);
 
 protected:
   const ContextT &Context;
@@ -428,9 +428,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   // Internal worklist for divergence propagation.
   std::vector<const InstructionT *> Worklist;
 
-  // Set of instructions that require custom uniformity analysis based on
-  // operand uniformity.
-  SmallPtrSet<const InstructionT *, 8> CustomUniformCandidates;
+  // Set of instructions that require custom divergence analysis based on
+  // operand divergence.
+  SmallPtrSet<const InstructionT *, 8> CustomDivergenceCandidates;
 
   /// \brief Mark \p Term as divergent and push all Instructions that become
   /// divergent as a result on the worklist.
@@ -795,12 +795,13 @@ void GenericUniformityAnalysisImpl<ContextT>::markDivergent(
     const InstructionT &I) {
   if (isAlwaysUniform(I))
     return;
-  // Check if instruction requires custom uniformity analysis
-  if (CustomUniformCandidates.count(&I)) {
-    if (isCustomUniform(I)) {
-      addUniformOverride(I);
-      return;
-    }
+  // For custom divergence candidates, try to prove divergence.
+  // If we can't prove it's divergent yet, skip marking it.
+  // The candidate will be re-evaluated as operands become divergent.
+  if (CustomDivergenceCandidates.count(&I)) {
+    if (!isCustomDivergent(I))
+      return; // Can't prove divergent yet, assume uniform
+    // Otherwise, we can prove it's divergent, continue to mark it
   }
   bool Marked = false;
   if (I.isTerminator()) {
@@ -834,9 +835,9 @@ void GenericUniformityAnalysisImpl<ContextT>::addUniformOverride(
 }
 
 template <typename ContextT>
-void GenericUniformityAnalysisImpl<ContextT>::addCustomUniformCandidate(
+void GenericUniformityAnalysisImpl<ContextT>::addCustomDivergenceCandidate(
     const InstructionT *I) {
-  CustomUniformCandidates.insert(I);
+  CustomDivergenceCandidates.insert(I);
 }
 
 // Mark as divergent all external uses of values defined in \p DefCycle.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index b6dc19a873ec4..6f50ffd048df7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -2084,17 +2084,16 @@ class TargetTransformInfo {
   /// target.
   LLVM_ABI bool allowVectorElementIndexingUsingGEP() const;
 
-  /// Determine if an instruction with some operands uniform can be proven
-  /// uniform. This is used for custom uniformity analysis where the target
-  /// can define complex rules that depend on which specific operands are
-  /// uniform.
+  /// Determine if an instruction can be proven divergent based on which
+  /// operands are divergent.
   ///
   /// \param I The instruction to check.
-  /// \param UniformArgs A bitvector indicating which operands are known to be
-  ///                    uniform (bit N corresponds to operand N).
-  /// \returns true if the instruction result can be proven uniform given the
-  ///          uniform operands, false otherwise.
-  bool isUniform(const Instruction *I, const SmallBitVector &UniformArgs) const;
+  /// \param DivergentArgs A bitvector indicating which operands are known to be
+  ///                      divergent (bit N corresponds to operand N).
+  /// \returns true if the instruction result can be proven divergent given the
+  ///          divergent operands, false otherwise.
+  LLVM_ABI bool isDivergent(const Instruction *I,
+                            const SmallBitVector &DivergentArgs) const;
 
 private:
   std::unique_ptr<const TargetTransformInfoImplBase> TTIImpl;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 246c05709c4d8..3bf6cb854a911 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1229,11 +1229,11 @@ class TargetTransformInfoImplBase {
   virtual bool allowVectorElementIndexingUsingGEP() const { return true; }
 
   // Custom uniformity check for instructions marked as Custom
-  // Override this to provide complex uniformity rules based on which operands
-  // are uniform
-  virtual bool isUniform(const Instruction *I,
-                         const SmallBitVector &UniformArgs) const {
-    return false; // Conservative: assume divergent
+  // Override this to provide complex divergence rules based on which operands
+  // are divergent
+  virtual bool isDivergent(const Instruction *I,
+                           const SmallBitVector &DivergentArgs) const {
+    return false; // Conservative: can't prove divergent
   }
 
 protected:
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 67c3ac5f4c40e..5e0b7a242e20e 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2367,20 +2367,17 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
     return InstructionUniformity::Default;
   }
 
-  /// Determine if a machine instruction with some operands uniform can be
-  /// proven uniform. This is used for custom uniformity analysis where the
-  /// target can define complex rules that depend on which specific operands
-  /// are uniform.
+  /// Determine if a machine instruction can be proven divergent based on which
+  /// operands are divergent.
   ///
   /// \param MI The machine instruction to check.
-  /// \param UniformArgs A bitvector indicating which register operands are
-  ///                    known to be uniform (bit N corresponds to the Nth
-  ///                    register use operand).
-  /// \returns true if the instruction result can be proven uniform given the
-  ///          uniform operands, false otherwise.
-  virtual bool isUniform(const MachineInstr &MI,
-                         const SmallBitVector &UniformArgs) const {
-    return false; // Conservative: assume divergent
+  /// \param DivergentArgs A bitvector indicating which operands are known to be
+  ///                      divergent (bit N corresponds to operand N).
+  /// \returns true if the instruction result can be proven divergent given the
+  ///          divergent operands, false otherwise.
+  virtual bool isDivergent(const MachineInstr &MI,
+                           const SmallBitVector &DivergentArgs) const {
+    return false; // Conservative: can't prove divergent
   }
 
   /// Returns true if the given \p MI defines a TargetIndex operand that can be
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 9345595555c2d..56152c9d8f429 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1551,9 +1551,9 @@ bool TargetTransformInfo::allowVectorElementIndexingUsingGEP() const {
   return TTIImpl->allowVectorElementIndexingUsingGEP();
 }
 
-bool TargetTransformInfo::isUniform(const Instruction *I,
-                                    const SmallBitVector &UniformArgs) const {
-  return TTIImpl->isUniform(I, UniformArgs);
+bool TargetTransformInfo::isDivergent(
+    const Instruction *I, const SmallBitVector &DivergentArgs) const {
+  return TTIImpl->isDivergent(I, DivergentArgs);
 }
 
 TargetTransformInfoImplBase::~TargetTransformInfoImplBase() = default;
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 973899c7a161c..551ec7f4a917a 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -42,8 +42,8 @@ template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
       markDivergent(I);
       break;
     case InstructionUniformity::Custom:
-      // Instructions requiring custom uniformity analysis based on operands
-      addCustomUniformCandidate(&I);
+      // Instructions requiring custom divergence analysis based on operands
+      addCustomDivergenceCandidate(&I);
       break;
     case InstructionUniformity::Default:
       break;
@@ -115,15 +115,15 @@ bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
 }
 
 template <>
-bool GenericUniformityAnalysisImpl<SSAContext>::isCustomUniform(
+bool GenericUniformityAnalysisImpl<SSAContext>::isCustomDivergent(
     const Instruction &I) const {
-  // Build bitvector of uniform operands
-  SmallBitVector UniformArgs(I.getNumOperands());
-  for (unsigned OpIdx = 0, E = UniformArgs.size(); OpIdx != E; ++OpIdx) {
-    UniformArgs[OpIdx] = !isDivergentUse(I.getOperandUse(OpIdx));
+  // Build bitvector of divergent operands
+  SmallBitVector DivergentArgs(I.getNumOperands());
+  for (unsigned OpIdx = 0, E = DivergentArgs.size(); OpIdx != E; ++OpIdx) {
+    DivergentArgs[OpIdx] = isDivergentUse(I.getOperandUse(OpIdx));
   }
-  // Query target-specific uniformity callback
-  return TTI->isUniform(&I, UniformArgs);
+  // Query target-specific divergence callback
+  return TTI->isDivergent(&I, DivergentArgs);
 }
 
 // This ensures explicit instantiation of
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 5ee1e5ca8f46c..2e932f4edad9b 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -63,8 +63,8 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
         markDivergent(instr);
         break;
       case InstructionUniformity::Custom:
-        // Instructions requiring custom uniformity analysis based on operands
-        addCustomUniformCandidate(&instr);
+        // Instructions requiring custom divergence analysis based on operands
+        addCustomDivergenceCandidate(&instr);
         break;
       case InstructionUniformity::Default:
         break;
@@ -157,21 +157,22 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
 }
 
 template <>
-bool GenericUniformityAnalysisImpl<MachineSSAContext>::isCustomUniform(
+bool GenericUniformityAnalysisImpl<MachineSSAContext>::isCustomDivergent(
     const MachineInstr &MI) const {
   const auto &InstrInfo = *F.getSubtarget().getInstrInfo();
 
-  // Build bitvector of uniform operands
-  SmallBitVector UniformArgs(MI.getNumOperands());
+  // Build bitvector of divergent operands
+  SmallBitVector DivergentArgs(MI.getNumOperands());
   for (unsigned OpIdx = 0, E = MI.getNumOperands(); OpIdx != E; ++OpIdx) {
     const MachineOperand &MO = MI.getOperand(OpIdx);
     // Register operands: check if divergent
-    // Non-register operands (immediates, etc.): always uniform
-    UniformArgs[OpIdx] = !MO.isReg() || !isDivergentUse(MO);
+    // Non-register operands (immediates, etc.): always uniform (never
+    // divergent)
+    DivergentArgs[OpIdx] = MO.isReg() && isDivergentUse(MO);
   }
 
-  // Query target-specific uniformity callback
-  return InstrInfo.isUniform(MI, UniformArgs);
+  // Query target-specific divergence callback
+  return InstrInfo.isDivergent(MI, DivergentArgs);
 }
 
 // This ensures explicit instantiation of
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 79842504b19ee..2c4dad60a249b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1749,6 +1749,7 @@ GCNTTIImpl::getInstructionUniformity(const Value *V) const {
   return InstructionUniformity::Default;
 }
 
+<<<<<<< HEAD
 InstructionCost GCNTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                                  StackOffset BaseOffset,
                                                  bool HasBaseReg, int64_t Scale,
@@ -1796,21 +1797,20 @@ bool GCNTTIImpl::shouldDropLSRSolutionIfLessProfitable() const {
   return true;
 }
 
-bool GCNTTIImpl::isUniform(const Instruction *I,
-                           const SmallBitVector &UniformArgs) const {
-  // Custom uniformity check for permlane16/permlanex16
+bool GCNTTIImpl::isDivergent(const Instruction *I,
+                             const SmallBitVector &DivergentArgs) const {
   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(I)) {
     switch (Intrinsic->getIntrinsicID()) {
     case Intrinsic::amdgcn_permlane16:
     case Intrinsic::amdgcn_permlanex16:
       // For permlane16/permlanex16:
-      // Operand 0: old value (ignored for uniformity)
+      // Operand 0: old value (ignored for divergence)
       // Operand 1: src0 (source value to permute)
       // Operand 2: src1 (lane select within 16-lane group)
       // Operand 3: src2 (which 16-lane group)
-      // Result is uniform if either src0 (op 1) or src1 (op 2) is uniform
-      if (UniformArgs.size() > 2) {
-        return UniformArgs[1] || UniformArgs[2];
+      // Result is divergent if both src0 (op 1) and src1 (op 2) are divergent
+      if (DivergentArgs.size() > 2) {
+        return DivergentArgs[1] && DivergentArgs[2];
       }
       return false;
     default:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index ea2bf72836199..32aec671383f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -322,8 +322,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   bool isNumRegsMajorCostOfLSR() const override;
   bool shouldDropLSRSolutionIfLessProfitable() const override;
 
-  bool isUniform(const Instruction *I,
-                 const SmallBitVector &UniformArgs) const override;
+  bool isDivergent(const Instruction *I,
+                   const SmallBitVector &DivergentArgs) const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index b15d57d346669..fbe1b9c402188 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10887,18 +10887,18 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
 
   return InstructionUniformity::Default;
 }
-bool SIInstrInfo::isUniform(const MachineInstr &MI,
-                            const SmallBitVector &UniformArgs) const {
+bool SIInstrInfo::isDivergent(const MachineInstr &MI,
+                              const SmallBitVector &DivergentArgs) const {
   unsigned opcode = MI.getOpcode();
 
-  // Custom uniformity check for permlane16/permlanex16
+  // Custom divergence check for permlane16/permlanex16
   if (opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
       opcode == AMDGPU::V_PERMLANEX16_B32_e64) {
-    // Result is uniform if either src0 or src1 is uniform
-    // UniformArgs[0] = src0 (source value)
-    // UniformArgs[1] = src1 (lane select)
-    if (UniformArgs.size() >= 2) {
-      return UniformArgs[0] || UniformArgs[1];
+    // Result is divergent if both src0 and src1 are divergent
+    // DivergentArgs[0] = src0 (source value)
+    // DivergentArgs[1] = src1 (lane select)
+    if (DivergentArgs.size() >= 2) {
+      return DivergentArgs[0] && DivergentArgs[1];
     }
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 429982f75f29d..3d09521e0794f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1680,8 +1680,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   InstructionUniformity
   getInstructionUniformity(const MachineInstr &MI) const final;
 
-  bool isUniform(const MachineInstr &MI,
-                 const SmallBitVector &UniformArgs) const final;
+  bool isDivergent(const MachineInstr &MI,
+                   const SmallBitVector &DivergentArgs) const final;
 
   InstructionUniformity
   getGenericInstructionUniformity(const MachineInstr &MI) const;

>From 0a3b84a74b1f8053e3ab9baa44baba2fcc790cc1 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Fri, 20 Mar 2026 15:52:33 +0530
Subject: [PATCH 15/17] review: address suggestions

---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 29 +++----
 llvm/include/llvm/ADT/Uniformity.h            |  6 +-
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  5 +-
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   | 13 ---
 llvm/lib/Analysis/UniformityAnalysis.cpp      |  9 +-
 .../lib/CodeGen/MachineUniformityAnalysis.cpp | 20 +----
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      | 29 ++-----
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 24 ------
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  3 -
 .../AMDGPU/MIR/uniform-permlane.mir           | 86 -------------------
 .../UniformityAnalysis/AMDGPU/intrinsics.ll   |  4 +-
 .../AMDGPU/uniform_intrinsic.ll               | 71 +++++++--------
 12 files changed, 62 insertions(+), 237 deletions(-)
 delete mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index beeb5ad86608e..44f4db3cccb4c 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -408,12 +408,12 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   void recordTemporalDivergence(ConstValueRefT, const InstructionT *,
                                 const CycleT *);
 
-  /// Check if an instruction with Custom uniformity can be proven divergent
+  /// Check if an instruction with Custom uniformity can be proven uniform
   /// based on its operands. This queries the target-specific callback.
-  bool isCustomDivergent(const InstructionT &I) const;
+  bool isCustomUniform(const InstructionT &I) const;
 
-  /// \brief Add an instruction that requires custom divergence analysis.
-  void addCustomDivergenceCandidate(const InstructionT *I);
+  /// \brief Add an instruction that requires custom uniformity analysis.
+  void addCustomUniformityCandidate(const InstructionT *I);
 
 protected:
   const ContextT &Context;
@@ -428,9 +428,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   // Internal worklist for divergence propagation.
   std::vector<const InstructionT *> Worklist;
 
-  // Set of instructions that require custom divergence analysis based on
-  // operand divergence.
-  SmallPtrSet<const InstructionT *, 8> CustomDivergenceCandidates;
+  // Set of instructions that require custom uniformity analysis based on
+  // operand uniformity.
+  SmallPtrSet<const InstructionT *, 8> CustomUniformityCandidates;
 
   /// \brief Mark \p Term as divergent and push all Instructions that become
   /// divergent as a result on the worklist.
@@ -795,13 +795,12 @@ void GenericUniformityAnalysisImpl<ContextT>::markDivergent(
     const InstructionT &I) {
   if (isAlwaysUniform(I))
     return;
-  // For custom divergence candidates, try to prove divergence.
-  // If we can't prove it's divergent yet, skip marking it.
+  // For custom uniformity candidates, check if the instruction can be
+  // proven uniform based on which operands are uniform/divergent.
   // The candidate will be re-evaluated as operands become divergent.
-  if (CustomDivergenceCandidates.count(&I)) {
-    if (!isCustomDivergent(I))
-      return; // Can't prove divergent yet, assume uniform
-    // Otherwise, we can prove it's divergent, continue to mark it
+  if (CustomUniformityCandidates.count(&I)) {
+    if (isCustomUniform(I))
+      return;
   }
   bool Marked = false;
   if (I.isTerminator()) {
@@ -835,9 +834,9 @@ void GenericUniformityAnalysisImpl<ContextT>::addUniformOverride(
 }
 
 template <typename ContextT>
-void GenericUniformityAnalysisImpl<ContextT>::addCustomDivergenceCandidate(
+void GenericUniformityAnalysisImpl<ContextT>::addCustomUniformityCandidate(
     const InstructionT *I) {
-  CustomDivergenceCandidates.insert(I);
+  CustomUniformityCandidates.insert(I);
 }
 
 // Mark as divergent all external uses of values defined in \p DefCycle.
diff --git a/llvm/include/llvm/ADT/Uniformity.h b/llvm/include/llvm/ADT/Uniformity.h
index 43e588745f73f..0adbdf99c6d2a 100644
--- a/llvm/include/llvm/ADT/Uniformity.h
+++ b/llvm/include/llvm/ADT/Uniformity.h
@@ -25,9 +25,9 @@ enum class InstructionUniformity {
   /// The result values can never be assumed to be uniform.
   NeverUniform,
 
-  /// If all operands are uniform, the result values are uniform. Otherwise,
-  /// the result values may be divergent, and a custom check may be used to
-  /// determine uniformity via a callback.
+  /// The result values require a custom uniformity check. A target-specific
+  /// callback determines whether the result is uniform based on which
+  /// operands are uniform.
   Custom
 };
 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 3bf6cb854a911..a2cf6f133f09c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1228,12 +1228,9 @@ class TargetTransformInfoImplBase {
 
   virtual bool allowVectorElementIndexingUsingGEP() const { return true; }
 
-  // Custom uniformity check for instructions marked as Custom
-  // Override this to provide complex divergence rules based on which operands
-  // are divergent
   virtual bool isDivergent(const Instruction *I,
                            const SmallBitVector &DivergentArgs) const {
-    return false; // Conservative: can't prove divergent
+    llvm_unreachable("target must implement isDivergent for Custom uniformity");
   }
 
 protected:
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 5e0b7a242e20e..77f710203d1fc 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2367,19 +2367,6 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
     return InstructionUniformity::Default;
   }
 
-  /// Determine if a machine instruction can be proven divergent based on which
-  /// operands are divergent.
-  ///
-  /// \param MI The machine instruction to check.
-  /// \param DivergentArgs A bitvector indicating which operands are known to be
-  ///                      divergent (bit N corresponds to operand N).
-  /// \returns true if the instruction result can be proven divergent given the
-  ///          divergent operands, false otherwise.
-  virtual bool isDivergent(const MachineInstr &MI,
-                           const SmallBitVector &DivergentArgs) const {
-    return false; // Conservative: can't prove divergent
-  }
-
   /// Returns true if the given \p MI defines a TargetIndex operand that can be
   /// tracked by their offset, can have values, and can have debug info
   /// associated with it. If so, sets \p Index and \p Offset of the target index
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 551ec7f4a917a..c0d301a99b86e 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -42,8 +42,7 @@ template <> void llvm::GenericUniformityAnalysisImpl<SSAContext>::initialize() {
       markDivergent(I);
       break;
     case InstructionUniformity::Custom:
-      // Instructions requiring custom divergence analysis based on operands
-      addCustomDivergenceCandidate(&I);
+      addCustomUniformityCandidate(&I);
       break;
     case InstructionUniformity::Default:
       break;
@@ -115,15 +114,13 @@ bool llvm::GenericUniformityAnalysisImpl<SSAContext>::isDivergentUse(
 }
 
 template <>
-bool GenericUniformityAnalysisImpl<SSAContext>::isCustomDivergent(
+bool GenericUniformityAnalysisImpl<SSAContext>::isCustomUniform(
     const Instruction &I) const {
-  // Build bitvector of divergent operands
   SmallBitVector DivergentArgs(I.getNumOperands());
   for (unsigned OpIdx = 0, E = DivergentArgs.size(); OpIdx != E; ++OpIdx) {
     DivergentArgs[OpIdx] = isDivergentUse(I.getOperandUse(OpIdx));
   }
-  // Query target-specific divergence callback
-  return TTI->isDivergent(&I, DivergentArgs);
+  return !TTI->isDivergent(&I, DivergentArgs);
 }
 
 // This ensures explicit instantiation of
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 2e932f4edad9b..af1c448497a52 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -8,7 +8,6 @@
 
 #include "llvm/CodeGen/MachineUniformityAnalysis.h"
 #include "llvm/ADT/GenericUniformityImpl.h"
-#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/MachineCycleAnalysis.h"
 #include "llvm/CodeGen/MachineDominators.h"
@@ -63,8 +62,6 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
         markDivergent(instr);
         break;
       case InstructionUniformity::Custom:
-        // Instructions requiring custom divergence analysis based on operands
-        addCustomDivergenceCandidate(&instr);
         break;
       case InstructionUniformity::Default:
         break;
@@ -157,22 +154,9 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
 }
 
 template <>
-bool GenericUniformityAnalysisImpl<MachineSSAContext>::isCustomDivergent(
+bool GenericUniformityAnalysisImpl<MachineSSAContext>::isCustomUniform(
     const MachineInstr &MI) const {
-  const auto &InstrInfo = *F.getSubtarget().getInstrInfo();
-
-  // Build bitvector of divergent operands
-  SmallBitVector DivergentArgs(MI.getNumOperands());
-  for (unsigned OpIdx = 0, E = MI.getNumOperands(); OpIdx != E; ++OpIdx) {
-    const MachineOperand &MO = MI.getOperand(OpIdx);
-    // Register operands: check if divergent
-    // Non-register operands (immediates, etc.): always uniform (never
-    // divergent)
-    DivergentArgs[OpIdx] = MO.isReg() && isDivergentUse(MO);
-  }
-
-  // Query target-specific divergence callback
-  return InstrInfo.isDivergent(MI, DivergentArgs);
+  llvm_unreachable("no MIR instructions use Custom uniformity yet");
 }
 
 // This ensures explicit instantiation of
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 2c4dad60a249b..11f25e6aff5da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1732,8 +1732,7 @@ InstructionUniformity
 GCNTTIImpl::getInstructionUniformity(const Value *V) const {
   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
     switch (Intrinsic->getIntrinsicID()) {
-    case Intrinsic::amdgcn_permlane16:
-    case Intrinsic::amdgcn_permlanex16:
+    case Intrinsic::amdgcn_wave_shuffle:
       return InstructionUniformity::Custom;
     default:
       break;
@@ -1749,7 +1748,6 @@ GCNTTIImpl::getInstructionUniformity(const Value *V) const {
   return InstructionUniformity::Default;
 }
 
-<<<<<<< HEAD
 InstructionCost GCNTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                                  StackOffset BaseOffset,
                                                  bool HasBaseReg, int64_t Scale,
@@ -1799,23 +1797,12 @@ bool GCNTTIImpl::shouldDropLSRSolutionIfLessProfitable() const {
 
 bool GCNTTIImpl::isDivergent(const Instruction *I,
                              const SmallBitVector &DivergentArgs) const {
-  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(I)) {
-    switch (Intrinsic->getIntrinsicID()) {
-    case Intrinsic::amdgcn_permlane16:
-    case Intrinsic::amdgcn_permlanex16:
-      // For permlane16/permlanex16:
-      // Operand 0: old value (ignored for divergence)
-      // Operand 1: src0 (source value to permute)
-      // Operand 2: src1 (lane select within 16-lane group)
-      // Operand 3: src2 (which 16-lane group)
-      // Result is divergent if both src0 (op 1) and src1 (op 2) are divergent
-      if (DivergentArgs.size() > 2) {
-        return DivergentArgs[1] && DivergentArgs[2];
-      }
-      return false;
-    default:
-      break;
-    }
+  const IntrinsicInst *Intrinsic = cast<IntrinsicInst>(I);
+  switch (Intrinsic->getIntrinsicID()) {
+  case Intrinsic::amdgcn_wave_shuffle:
+    // wave_shuffle(Value, Index): result is divergent iff Index is divergent.
+    return DivergentArgs[1];
+  default:
+    llvm_unreachable("unexpected intrinsic in isDivergent");
   }
-  return false;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index fbe1b9c402188..5086c553da101 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -20,7 +20,6 @@
 #include "SIMachineFunctionInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/LiveIntervals.h"
@@ -10804,12 +10803,6 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
     return InstructionUniformity::NeverUniform;
 
   unsigned opcode = MI.getOpcode();
-
-  // permlane16/permlanex16 require custom uniformity analysis
-  if (opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
-      opcode == AMDGPU::V_PERMLANEX16_B32_e64)
-    return InstructionUniformity::Custom;
-
   if (opcode == AMDGPU::V_READLANE_B32 ||
       opcode == AMDGPU::V_READFIRSTLANE_B32 ||
       opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
@@ -10887,23 +10880,6 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
 
   return InstructionUniformity::Default;
 }
-bool SIInstrInfo::isDivergent(const MachineInstr &MI,
-                              const SmallBitVector &DivergentArgs) const {
-  unsigned opcode = MI.getOpcode();
-
-  // Custom divergence check for permlane16/permlanex16
-  if (opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
-      opcode == AMDGPU::V_PERMLANEX16_B32_e64) {
-    // Result is divergent if both src0 and src1 are divergent
-    // DivergentArgs[0] = src0 (source value)
-    // DivergentArgs[1] = src1 (lane select)
-    if (DivergentArgs.size() >= 2) {
-      return DivergentArgs[0] && DivergentArgs[1];
-    }
-  }
-
-  return false;
-}
 
 unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
   switch (MF.getFunction().getCallingConv()) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 3d09521e0794f..93d28d22bfd16 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1680,9 +1680,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   InstructionUniformity
   getInstructionUniformity(const MachineInstr &MI) const final;
 
-  bool isDivergent(const MachineInstr &MI,
-                   const SmallBitVector &DivergentArgs) const final;
-
   InstructionUniformity
   getGenericInstructionUniformity(const MachineInstr &MI) const;
 
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
deleted file mode 100644
index da6048d86b2dd..0000000000000
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uniform-permlane.mir
+++ /dev/null
@@ -1,86 +0,0 @@
-# RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s
-
-# Test the machine-level uniformity analysis for permlane16/permlanex16 instructions.
-#
-# NOTE: Permlane instructions have a hardware constraint that src1 (lane select) and src2
-# must be SGPR (scalar) registers. Since SGPRs are always uniform at machine level, 
-# permlane results are always uniform according to the AnyOfFirstTwoUseOp logic
-# (either src0 OR src1 being uniform makes the result uniform, and src1 is always uniform).
-#
-# These tests verify that the uniformity analysis correctly handles permlane instructions
-# and that uniform results propagate through chains of operations.
-
----
-# Test: permlane16 with divergent VGPR src and uniform SGPR lane select
-# Result is UNIFORM because lane select (SGPR) is always uniform
-name: permlane16_basic
-machineFunctionInfo:
-  isEntryFunction: true
-body: |
-  bb.0:
-    ; CHECK-LABEL: MachineUniformityInfo for function: @permlane16_basic
-    ; CHECK: ALL VALUES UNIFORM
-    %0:vgpr_32 = IMPLICIT_DEF
-    %1:sreg_32 = S_MOV_B32 5
-    %2:sreg_32 = IMPLICIT_DEF
-    %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
-    S_ENDPGM 0
-
-...
----
-# Test: permlanex16 with divergent VGPR src and uniform SGPR lane select
-# Result is UNIFORM because lane select (SGPR) is always uniform
-name: permlanex16_basic
-machineFunctionInfo:
-  isEntryFunction: true
-body: |
-  bb.0:
-    ; CHECK-LABEL: MachineUniformityInfo for function: @permlanex16_basic
-    ; CHECK: ALL VALUES UNIFORM
-    %0:vgpr_32 = IMPLICIT_DEF
-    %1:sreg_32 = S_MOV_B32 7
-    %2:sreg_32 = IMPLICIT_DEF
-    %3:vgpr_32 = V_PERMLANEX16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
-    S_ENDPGM 0
-
-...
----
-# Test: Chain of permlane operations - uniformity propagates
-# Both permlanes are uniform, second uses result of first as source
-name: permlane16_chain_uniform
-machineFunctionInfo:
-  isEntryFunction: true
-body: |
-  bb.0:
-    ; CHECK-LABEL: MachineUniformityInfo for function: @permlane16_chain_uniform
-    ; CHECK: ALL VALUES UNIFORM
-    %0:vgpr_32 = IMPLICIT_DEF
-    %1:sreg_32 = S_MOV_B32 3
-    %2:sreg_32 = IMPLICIT_DEF
-    ; First permlane - uniform because lane select is SGPR
-    %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
-    ; Second permlane uses uniform result - also uniform
-    %4:vgpr_32 = V_PERMLANEX16_B32_e64 0, %3, 0, %1, 0, %2, %3, 0, implicit $exec
-    S_ENDPGM 0
-
-...
----
-# Test: Multiple permlane operations in sequence
-# Verifies that uniformity is correctly tracked through complex chains
-name: permlane_multiple
-machineFunctionInfo:
-  isEntryFunction: true
-body: |
-  bb.0:
-    ; CHECK-LABEL: MachineUniformityInfo for function: @permlane_multiple
-    ; CHECK: ALL VALUES UNIFORM
-    %0:vgpr_32 = IMPLICIT_DEF
-    %1:sreg_32 = S_MOV_B32 1
-    %2:sreg_32 = S_MOV_B32 2  
-    %3:vgpr_32 = V_PERMLANE16_B32_e64 0, %0, 0, %1, 0, %2, %0, 0, implicit $exec
-    %4:vgpr_32 = V_PERMLANEX16_B32_e64 0, %3, 0, %1, 0, %2, %3, 0, implicit $exec
-    %5:vgpr_32 = V_PERMLANE16_B32_e64 0, %4, 0, %2, 0, %1, %4, 0, implicit $exec
-    S_ENDPGM 0
-
-...
-
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 9b91c7bee84bd..46cb8cc1312dc 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -7,14 +7,14 @@ define amdgpu_kernel void @ds_swizzle(ptr addrspace(1) %out, i32 %src) #0 {
   ret void
 }
 
-; CHECK: ALL VALUES UNIFORM
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
 define amdgpu_kernel void @v_permlane16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
   %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
   store i32 %v, ptr addrspace(1) %out
   ret void
 }
 
-; CHECK: ALL VALUES UNIFORM
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
 define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
   %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
   store i32 %v, ptr addrspace(1) %out
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
index e7391ee0c265b..d7a697e85f376 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
@@ -1,59 +1,46 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
-; CHECK: ALL VALUES UNIFORM
-define amdgpu_kernel void @v_permlane16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
-  %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
-  store i32 %v, ptr addrspace(1) %out
-  ret void
-}
+; wave_shuffle(Value, Index): result is uniform when Index is uniform,
+; regardless of Value's divergence.
 
+; All kernel args are uniform, so Index is uniform => result is uniform.
+; CHECK-LABEL: UniformityInfo for function 'wave_shuffle_all_uniform':
 ; CHECK: ALL VALUES UNIFORM
-define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
-  %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
+define amdgpu_kernel void @wave_shuffle_all_uniform(ptr addrspace(1) %out, i32 %val, i32 %idx) {
+  %v = call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %idx)
   store i32 %v, ptr addrspace(1) %out
   ret void
 }
 
-; CHECK:  DIVERGENT:   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK:               %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK:               store i32 %v1, ptr addrspace(1) %out, align 4
-define amdgpu_kernel void @div_permlane16_var_uni_usr_x16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
-  %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
-  %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false) #0
-  store i32 %v1, ptr addrspace(1) %out
-  ret void
-}
-
-; CHECK:  DIVERGENT:   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK:               %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK:               store i32 %v1, ptr addrspace(1) %out, align 4
-define amdgpu_kernel void @div_permlane16_var_uni_x16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
-  %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
-  %v1 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
-  store i32 %v1, ptr addrspace(1) %out
+; Value is divergent (thread ID), but Index is uniform => result is uniform.
+; CHECK-LABEL: UniformityInfo for function 'wave_shuffle_divergent_val_uniform_idx':
+; CHECK-NOT: DIVERGENT: {{.*}}wave.shuffle
+define amdgpu_kernel void @wave_shuffle_divergent_val_uniform_idx(ptr addrspace(1) %out, i32 %idx) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %v = call i32 @llvm.amdgcn.wave.shuffle(i32 %tid, i32 %idx)
+  store i32 %v, ptr addrspace(1) %out
   ret void
 }
 
-; CHECK:  DIVERGENT:   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK:               %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK:               store i32 %v1, ptr addrspace(1) %out, align 4
-define amdgpu_kernel void @div_permlane16_var_uni_usr_16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
-  %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
-  %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %v, i32 %src1, i32 %src2, i1 false, i1 false) #0
-  store i32 %v1, ptr addrspace(1) %out
+; Value is uniform, but Index is divergent (thread ID) => result is divergent.
+; CHECK-LABEL: UniformityInfo for function 'wave_shuffle_uniform_val_divergent_idx':
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.wave.shuffle.i32(i32 %val, i32 %tid)
+define amdgpu_kernel void @wave_shuffle_uniform_val_divergent_idx(ptr addrspace(1) %out, i32 %val) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %v = call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %tid)
+  store i32 %v, ptr addrspace(1) %out
   ret void
 }
 
-; CHECK:  DIVERGENT:   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK:               %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
-; CHECK:               store i32 %v1, ptr addrspace(1) %out, align 4
-define amdgpu_kernel void @div_permlane16_var_uni_16(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
-  %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
-  %v1 = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
-  store i32 %v1, ptr addrspace(1) %out
+; Both Value and Index are divergent => result is divergent.
+; CHECK-LABEL: UniformityInfo for function 'wave_shuffle_both_divergent':
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.wave.shuffle.i32(i32 %tid, i32 %tid)
+define amdgpu_kernel void @wave_shuffle_both_divergent(ptr addrspace(1) %out) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %v = call i32 @llvm.amdgcn.wave.shuffle(i32 %tid, i32 %tid)
+  store i32 %v, ptr addrspace(1) %out
   ret void
 }
 
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
+declare i32 @llvm.amdgcn.wave.shuffle(i32, i32)
+declare i32 @llvm.amdgcn.workitem.id.x()

>From 06b7234eb0d24410a3dba55728a79ab81a40277c Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 24 Mar 2026 14:35:46 +0530
Subject: [PATCH 16/17] Update the divergence logic for wave_shuffle

---
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.cpp  |  6 ++++--
 .../AMDGPU/uniform_intrinsic.ll                  | 16 ++++++++++------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 11f25e6aff5da..72ad418d3cf17 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1800,8 +1800,10 @@ bool GCNTTIImpl::isDivergent(const Instruction *I,
   const IntrinsicInst *Intrinsic = cast<IntrinsicInst>(I);
   switch (Intrinsic->getIntrinsicID()) {
   case Intrinsic::amdgcn_wave_shuffle:
-    // wave_shuffle(Value, Index): result is divergent iff Index is divergent.
-    return DivergentArgs[1];
+    // wave_shuffle(Value, Index): result is divergent only when both Value and
+    // Index are divergent. A uniform Value read from any lane yields the same
+    // result, and a uniform Index makes all lanes read the same source lane.
+    return DivergentArgs[0] && DivergentArgs[1];
   default:
     llvm_unreachable("unexpected intrinsic in isDivergent");
   }
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
index d7a697e85f376..e3367235c9f6d 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/uniform_intrinsic.ll
@@ -1,9 +1,10 @@
 ; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
 
-; wave_shuffle(Value, Index): result is uniform when Index is uniform,
-; regardless of Value's divergence.
+; wave_shuffle(Value, Index): result is divergent only when both Value and
+; Index are divergent. A uniform Value read from any lane yields the same
+; result, and a uniform Index makes all lanes read the same source lane.
 
-; All kernel args are uniform, so Index is uniform => result is uniform.
+; All kernel args are uniform => result is uniform.
 ; CHECK-LABEL: UniformityInfo for function 'wave_shuffle_all_uniform':
 ; CHECK: ALL VALUES UNIFORM
 define amdgpu_kernel void @wave_shuffle_all_uniform(ptr addrspace(1) %out, i32 %val, i32 %idx) {
@@ -12,7 +13,8 @@ define amdgpu_kernel void @wave_shuffle_all_uniform(ptr addrspace(1) %out, i32 %
   ret void
 }
 
-; Value is divergent (thread ID), but Index is uniform => result is uniform.
+; Value is divergent, Index is uniform => result is uniform.
+; All lanes read from the same source lane, so every lane receives the same value.
 ; CHECK-LABEL: UniformityInfo for function 'wave_shuffle_divergent_val_uniform_idx':
 ; CHECK-NOT: DIVERGENT: {{.*}}wave.shuffle
 define amdgpu_kernel void @wave_shuffle_divergent_val_uniform_idx(ptr addrspace(1) %out, i32 %idx) {
@@ -22,9 +24,11 @@ define amdgpu_kernel void @wave_shuffle_divergent_val_uniform_idx(ptr addrspace(
   ret void
 }
 
-; Value is uniform, but Index is divergent (thread ID) => result is divergent.
+; Value is uniform, Index is divergent => result is uniform.
+; Each lane may read from a different source lane, but Value is the same
+; across all lanes, so the result is still uniform.
 ; CHECK-LABEL: UniformityInfo for function 'wave_shuffle_uniform_val_divergent_idx':
-; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.wave.shuffle.i32(i32 %val, i32 %tid)
+; CHECK-NOT: DIVERGENT: {{.*}}wave.shuffle
 define amdgpu_kernel void @wave_shuffle_uniform_val_divergent_idx(ptr addrspace(1) %out, i32 %val) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %tid)

>From 2c8c0c981478f704418d4d5b9bf059c13ccc511b Mon Sep 17 00:00:00 2001
From: Pankaj Dwivedi <divedi.pk.117 at gmail.com>
Date: Tue, 24 Mar 2026 16:32:50 +0530
Subject: [PATCH 17/17] Update llvm/include/llvm/ADT/GenericUniformityImpl.h

Co-authored-by: Jay Foad <jay.foad at gmail.com>
---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 44f4db3cccb4c..a306ed8cb6354 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -798,7 +798,7 @@ void GenericUniformityAnalysisImpl<ContextT>::markDivergent(
   // For custom uniformity candidates, check if the instruction can be
   // proven uniform based on which operands are uniform/divergent.
   // The candidate will be re-evaluated as operands become divergent.
-  if (CustomUniformityCandidates.count(&I)) {
+  if (CustomUniformityCandidates.contains(&I)) {
     if (isCustomUniform(I))
       return;
   }



More information about the llvm-commits mailing list