[llvm] [NaryReassociate] Teach NaryReassociate about UniformityAnalysis (PR #175167)

Pankaj Dwivedi via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 1 01:03:46 PDT 2026


https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/175167

>From 40984c61fd9e6b7e6141c1d1b1390483e918a6ba Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Tue, 13 Jan 2026 18:30:13 +0530
Subject: [PATCH 1/4] [NaryReassociate][AMDGPU] Pre-commit test for
 uniformity-aware reassociation (NFC)

---
 .../AMDGPU/nary-add-uniform.ll                | 319 ++++++++++++++++++
 1 file changed, 319 insertions(+)
 create mode 100644 llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll

diff --git a/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
new file mode 100644
index 0000000000000..d8f5c6009aa5e
--- /dev/null
+++ b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
@@ -0,0 +1,319 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; REQUIRES: amdgpu-registered-target
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -passes='nary-reassociate' -S | FileCheck %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+declare void @use(i32)
+
+; Test that NaryReassociate prefers grouping uniform values together when
+; uniformity analysis is available and both reassociation options exist.
+;
+; For I = (A op B) op RHS, the pass can form:
+;   - (A op RHS) op B
+;   - (B op RHS) op A
+;
+; When both dominating expressions exist, prefer the one grouping uniforms.
+
+; Both %d_u2 and %u1_u2 exist as dominating expressions.
+; For (d + u1) + u2:
+;   - Without UA preference: would try (d + u2) first, find %d_u2, return %d_u2 + u1
+;   - With UA preference: B=u1 and RHS=u2 are uniform, A=d is divergent
+;                         So prefer (u1 + u2) + d, returning %u1_u2 + d
+;
+
+define amdgpu_kernel void @prefer_uniform_grouping(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT:    [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[D_U2:%.*]] = add i32 [[D]], [[U2]]
+; CHECK-NEXT:    [[U1_U2:%.*]] = add i32 [[U1]], [[U2]]
+; CHECK-NEXT:    call void @use(i32 [[D_U2]])
+; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
+; CHECK-NEXT:    [[RESULT:%.*]] = add i32 [[D_U2]], [[U1]]
+; CHECK-NEXT:    call void @use(i32 [[RESULT]])
+; CHECK-NEXT:    ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  ; Create both possible reassociation targets
+  %d_u2 = add i32 %d, %u2        ; divergent + uniform
+  %u1_u2 = add i32 %u1, %u2      ; uniform + uniform (should be preferred!)
+
+  call void @use(i32 %d_u2)
+  call void @use(i32 %u1_u2)
+
+  ; (d + u1) + u2: both (d + u2) and (u1 + u2) exist
+  ; Should prefer (u1 + u2) + d to group uniforms
+  %tmp = add i32 %d, %u1
+  %result = add i32 %tmp, %u2
+  call void @use(i32 %result)
+
+  ret void
+}
+
+define amdgpu_kernel void @prefer_uniform_grouping_mul(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_mul(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT:    [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[D_U2:%.*]] = mul i32 [[D]], [[U2]]
+; CHECK-NEXT:    [[U1_U2:%.*]] = mul i32 [[U1]], [[U2]]
+; CHECK-NEXT:    call void @use(i32 [[D_U2]])
+; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
+; CHECK-NEXT:    [[RESULT:%.*]] = mul i32 [[D_U2]], [[U1]]
+; CHECK-NEXT:    call void @use(i32 [[RESULT]])
+; CHECK-NEXT:    ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  %d_u2 = mul i32 %d, %u2
+  %u1_u2 = mul i32 %u1, %u2
+
+  call void @use(i32 %d_u2)
+  call void @use(i32 %u1_u2)
+
+  %tmp = mul i32 %d, %u1
+  %result = mul i32 %tmp, %u2
+  call void @use(i32 %result)
+
+  ret void
+}
+
+define amdgpu_kernel void @only_one_option(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @only_one_option(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT:    [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[U1_U2:%.*]] = add i32 [[U1]], [[U2]]
+; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
+; CHECK-NEXT:    [[RESULT:%.*]] = add i32 [[U1_U2]], [[D]]
+; CHECK-NEXT:    call void @use(i32 [[RESULT]])
+; CHECK-NEXT:    ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  ; Only u1 + u2 exists, not d + u2
+  %u1_u2 = add i32 %u1, %u2
+  call void @use(i32 %u1_u2)
+
+  %tmp = add i32 %d, %u1
+  %result = add i32 %tmp, %u2
+  call void @use(i32 %result)
+
+  ret void
+}
+
+; When no dominating expression exists, no reassociation happens
+define amdgpu_kernel void @no_dominating_expr(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @no_dominating_expr(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT:    [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP:%.*]] = add i32 [[D]], [[U1]]
+; CHECK-NEXT:    [[RESULT:%.*]] = add i32 [[TMP]], [[U2]]
+; CHECK-NEXT:    call void @use(i32 [[RESULT]])
+; CHECK-NEXT:    ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  ; No dominating expressions exist
+  %tmp = add i32 %d, %u1
+  %result = add i32 %tmp, %u2
+  call void @use(i32 %result)
+
+  ret void
+}
+
+; Test smax: prefer grouping uniform values together
+; For smax(smax(A, B), RHS):
+;   - smax(smax(A, RHS), B): groups A and RHS
+;   - smax(smax(B, RHS), A): groups B and RHS
+; When B and RHS are uniform but A is divergent, prefer smax(smax(B, RHS), A)
+define amdgpu_kernel void @prefer_uniform_grouping_smax(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_smax(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT:    [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[D_U2:%.*]] = call i32 @llvm.smax.i32(i32 [[D]], i32 [[U2]])
+; CHECK-NEXT:    [[U1_U2:%.*]] = call i32 @llvm.smax.i32(i32 [[U1]], i32 [[U2]])
+; CHECK-NEXT:    call void @use(i32 [[D_U2]])
+; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
+; CHECK-NEXT:    [[RESULT_NARY:%.*]] = call i32 @llvm.smax.i32(i32 [[D_U2]], i32 [[U1]])
+; CHECK-NEXT:    call void @use(i32 [[RESULT_NARY]])
+; CHECK-NEXT:    ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  ; Create both possible reassociation targets
+  %d_u2 = call i32 @llvm.smax.i32(i32 %d, i32 %u2)    ; divergent, uniform
+  %u1_u2 = call i32 @llvm.smax.i32(i32 %u1, i32 %u2)  ; uniform, uniform (preferred!)
+
+  call void @use(i32 %d_u2)
+  call void @use(i32 %u1_u2)
+
+  ; smax(smax(d, u1), u2): both smax(d, u2) and smax(u1, u2) exist
+  ; Should prefer smax(smax(u1, u2), d) to group uniforms
+  %tmp = call i32 @llvm.smax.i32(i32 %d, i32 %u1)
+  %result = call i32 @llvm.smax.i32(i32 %tmp, i32 %u2)
+  call void @use(i32 %result)
+
+  ret void
+}
+
+; Test umin: prefer grouping uniform values together
+define amdgpu_kernel void @prefer_uniform_grouping_umin(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_umin(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT:    [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[D_U2:%.*]] = call i32 @llvm.umin.i32(i32 [[D]], i32 [[U2]])
+; CHECK-NEXT:    [[U1_U2:%.*]] = call i32 @llvm.umin.i32(i32 [[U1]], i32 [[U2]])
+; CHECK-NEXT:    call void @use(i32 [[D_U2]])
+; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
+; CHECK-NEXT:    [[RESULT_NARY:%.*]] = call i32 @llvm.umin.i32(i32 [[D_U2]], i32 [[U1]])
+; CHECK-NEXT:    call void @use(i32 [[RESULT_NARY]])
+; CHECK-NEXT:    ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  %d_u2 = call i32 @llvm.umin.i32(i32 %d, i32 %u2)
+  %u1_u2 = call i32 @llvm.umin.i32(i32 %u1, i32 %u2)
+
+  call void @use(i32 %d_u2)
+  call void @use(i32 %u1_u2)
+
+  %tmp = call i32 @llvm.umin.i32(i32 %d, i32 %u1)
+  %result = call i32 @llvm.umin.i32(i32 %tmp, i32 %u2)
+  call void @use(i32 %result)
+
+  ret void
+}
+
+; Test GEP with LHS=uniform, RHS=divergent
+define amdgpu_kernel void @gep_lhs_uniform_rhs_divergent(ptr %base, i64 %u_offset) {
+; CHECK-LABEL: define amdgpu_kernel void @gep_lhs_uniform_rhs_divergent(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[U_OFFSET:%.*]]) {
+; CHECK-NEXT:    [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[D_EXT:%.*]] = zext i32 [[D]] to i64
+; CHECK-NEXT:    [[GEP_U:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[U_OFFSET]]
+; CHECK-NEXT:    [[GEP_D:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[D_EXT]]
+; CHECK-NEXT:    call void @use_ptr(ptr [[GEP_U]])
+; CHECK-NEXT:    call void @use_ptr(ptr [[GEP_D]])
+; CHECK-NEXT:    [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_U]], i64 [[D_EXT]]
+; CHECK-NEXT:    call void @use_ptr(ptr [[GEP_RESULT]])
+; CHECK-NEXT:    ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_ext = zext i32 %d to i64
+
+  ; Create BOTH dominating GEPs so there's a choice
+  %gep_u = getelementptr i32, ptr %base, i64 %u_offset  ; uniform index
+  %gep_d = getelementptr i32, ptr %base, i64 %d_ext     ; divergent index
+
+  call void @use_ptr(ptr %gep_u)
+  call void @use_ptr(ptr %gep_d)
+
+  ; idx = u_offset + d_ext (LHS=uniform, RHS=divergent)
+  %idx = add i64 %u_offset, %d_ext
+  %gep_result = getelementptr i32, ptr %base, i64 %idx
+  call void @use_ptr(ptr %gep_result)
+
+  ret void
+}
+
+; Test GEP with LHS=divergent, RHS=uniform
+define amdgpu_kernel void @gep_lhs_divergent_rhs_uniform(ptr %base, i64 %u_offset) {
+; CHECK-LABEL: define amdgpu_kernel void @gep_lhs_divergent_rhs_uniform(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[U_OFFSET:%.*]]) {
+; CHECK-NEXT:    [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[D_EXT:%.*]] = zext i32 [[D]] to i64
+; CHECK-NEXT:    [[GEP_U:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[U_OFFSET]]
+; CHECK-NEXT:    [[GEP_D:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[D_EXT]]
+; CHECK-NEXT:    call void @use_ptr(ptr [[GEP_U]])
+; CHECK-NEXT:    call void @use_ptr(ptr [[GEP_D]])
+; CHECK-NEXT:    [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_D]], i64 [[U_OFFSET]]
+; CHECK-NEXT:    call void @use_ptr(ptr [[GEP_RESULT]])
+; CHECK-NEXT:    ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_ext = zext i32 %d to i64
+
+  ; Create BOTH dominating GEPs so there's a choice
+  %gep_u = getelementptr i32, ptr %base, i64 %u_offset  ; uniform index
+  %gep_d = getelementptr i32, ptr %base, i64 %d_ext     ; divergent index
+
+  call void @use_ptr(ptr %gep_u)
+  call void @use_ptr(ptr %gep_d)
+
+  ; idx = d_ext + u_offset (LHS=divergent, RHS=uniform)
+  %idx = add i64 %d_ext, %u_offset
+  %gep_result = getelementptr i32, ptr %base, i64 %idx
+  call void @use_ptr(ptr %gep_result)
+
+  ret void
+}
+
+; Test GEP with both LHS and RHS uniform - no preference needed
+define amdgpu_kernel void @gep_both_uniform(ptr %base, i64 %u1, i64 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @gep_both_uniform(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[U1:%.*]], i64 [[U2:%.*]]) {
+; CHECK-NEXT:    [[GEP_U1:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[U1]]
+; CHECK-NEXT:    [[GEP_U2:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[U2]]
+; CHECK-NEXT:    call void @use_ptr(ptr [[GEP_U1]])
+; CHECK-NEXT:    call void @use_ptr(ptr [[GEP_U2]])
+; CHECK-NEXT:    [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_U1]], i64 [[U2]]
+; CHECK-NEXT:    call void @use_ptr(ptr [[GEP_RESULT]])
+; CHECK-NEXT:    ret void
+;
+  ; Create both dominating GEPs with uniform indices
+  %gep_u1 = getelementptr i32, ptr %base, i64 %u1
+  %gep_u2 = getelementptr i32, ptr %base, i64 %u2
+
+  call void @use_ptr(ptr %gep_u1)
+  call void @use_ptr(ptr %gep_u2)
+
+  ; idx = u1 + u2 (both uniform - no preference needed)
+  %idx = add i64 %u1, %u2
+  %gep_result = getelementptr i32, ptr %base, i64 %idx
+  call void @use_ptr(ptr %gep_result)
+
+  ret void
+}
+
+; Test GEP with both LHS and RHS divergent - no preference needed
+define amdgpu_kernel void @gep_both_divergent(ptr %base) {
+; CHECK-LABEL: define amdgpu_kernel void @gep_both_divergent(
+; CHECK-SAME: ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[D1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[D2:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[D1_EXT:%.*]] = zext i32 [[D1]] to i64
+; CHECK-NEXT:    [[D2_EXT:%.*]] = zext i32 [[D2]] to i64
+; CHECK-NEXT:    [[GEP_D1:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[D1_EXT]]
+; CHECK-NEXT:    [[GEP_D2:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[D2_EXT]]
+; CHECK-NEXT:    call void @use_ptr(ptr [[GEP_D1]])
+; CHECK-NEXT:    call void @use_ptr(ptr [[GEP_D2]])
+; CHECK-NEXT:    [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_D1]], i64 [[D2_EXT]]
+; CHECK-NEXT:    call void @use_ptr(ptr [[GEP_RESULT]])
+; CHECK-NEXT:    ret void
+;
+  %d1 = call i32 @llvm.amdgcn.workitem.id.x()
+  %d2 = call i32 @llvm.amdgcn.workitem.id.y()
+  %d1_ext = zext i32 %d1 to i64
+  %d2_ext = zext i32 %d2 to i64
+
+  ; Create both dominating GEPs with divergent indices
+  %gep_d1 = getelementptr i32, ptr %base, i64 %d1_ext
+  %gep_d2 = getelementptr i32, ptr %base, i64 %d2_ext
+
+  call void @use_ptr(ptr %gep_d1)
+  call void @use_ptr(ptr %gep_d2)
+
+  ; idx = d1_ext + d2_ext (both divergent - no preference needed)
+  %idx = add i64 %d1_ext, %d2_ext
+  %gep_result = getelementptr i32, ptr %base, i64 %idx
+  call void @use_ptr(ptr %gep_result)
+
+  ret void
+}
+
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare void @use_ptr(ptr)

>From bf65765d667a28b3b974a1840c9bbdf7c4c0a16f Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Tue, 13 Jan 2026 18:40:20 +0530
Subject: [PATCH 2/4] [NaryReassociate] Make uniformity-aware to prefer
 grouping uniform values

---
 .../llvm/Transforms/Scalar/NaryReassociate.h  |  4 +-
 .../lib/Transforms/Scalar/NaryReassociate.cpp | 99 ++++++++++++++++++-
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |  9 +-
 .../AMDGPU/nary-add-uniform.ll                | 10 +-
 4 files changed, 110 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h b/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
index 417801d470800..d09d4eb79e366 100644
--- a/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
+++ b/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
@@ -81,6 +81,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/ValueHandle.h"
 
@@ -105,7 +106,7 @@ class NaryReassociatePass : public PassInfoMixin<NaryReassociatePass> {
   // Glue for old PM.
   bool runImpl(Function &F, AssumptionCache *AC_, DominatorTree *DT_,
                ScalarEvolution *SE_, TargetLibraryInfo *TLI_,
-               TargetTransformInfo *TTI_);
+               TargetTransformInfo *TTI_, UniformityInfo *UI_ = nullptr);
 
 private:
   // Runs only one iteration of the dominator-based algorithm. See the header
@@ -180,6 +181,7 @@ class NaryReassociatePass : public PassInfoMixin<NaryReassociatePass> {
   ScalarEvolution *SE;
   TargetLibraryInfo *TLI;
   TargetTransformInfo *TTI;
+  UniformityInfo *UI;
 
   // A lookup table quickly telling which instructions compute the given SCEV.
   // Note that there can be multiple instructions at different locations
diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index b0a33710c25bc..72dad5db0e931 100644
--- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -104,6 +104,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -141,6 +142,7 @@ class NaryReassociateLegacyPass : public FunctionPass {
     AU.addRequired<ScalarEvolutionWrapperPass>();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<UniformityInfoWrapperPass>();
     AU.setPreservesCFG();
   }
 
@@ -159,6 +161,7 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
 INITIALIZE_PASS_END(NaryReassociateLegacyPass, "nary-reassociate",
                     "Nary reassociation", false, false)
 
@@ -176,7 +179,11 @@ bool NaryReassociateLegacyPass::runOnFunction(Function &F) {
   auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
   auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  return Impl.runImpl(F, AC, DT, SE, TLI, TTI);
+  // UniformityInfo is required on all targets, but on targets without branch
+  // divergence it does no work and reports everything as uniform.
+  auto *UI = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+
+  return Impl.runImpl(F, AC, DT, SE, TLI, TTI, UI);
 }
 
 PreservedAnalyses NaryReassociatePass::run(Function &F,
@@ -187,7 +194,11 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
   auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
   auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
 
-  if (!runImpl(F, AC, DT, SE, TLI, TTI))
+  // UniformityInfo is required on all targets, but on targets without branch
+  // divergence it does no work and reports everything as uniform.
+  auto *UI = &AM.getResult<UniformityInfoAnalysis>(F);
+
+  if (!runImpl(F, AC, DT, SE, TLI, TTI, UI))
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA;
@@ -199,12 +210,14 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
 bool NaryReassociatePass::runImpl(Function &F, AssumptionCache *AC_,
                                   DominatorTree *DT_, ScalarEvolution *SE_,
                                   TargetLibraryInfo *TLI_,
-                                  TargetTransformInfo *TTI_) {
+                                  TargetTransformInfo *TTI_,
+                                  UniformityInfo *UI_) {
   AC = AC_;
   DT = DT_;
   SE = SE_;
   TLI = TLI_;
   TTI = TTI_;
+  UI = UI_;
   DL = &F.getDataLayout();
 
   bool Changed = false, ChangedInThisIteration;
@@ -379,6 +392,33 @@ NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
 
     Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1);
     // IndexToSplit = LHS + RHS.
+    // tryReassociateGEPAtIndex(GEP, I, LHS, RHS, ...) looks for a dominating
+    // GEP with LHS as index, then creates: NewGEP = existingGEP + RHS * scale.
+    // So the RHS becomes the "remaining" index calculation.
+    //
+    // For uniformity: prefer the remaining calculation to be uniform, as it
+    // can then stay in scalar registers.
+    //
+    // Default order tries LHS first (RHS as remainder). If LHS is uniform and
+    // RHS is divergent, we want to try RHS first so uniform LHS becomes the
+    // remainder. The case where RHS is uniform and LHS is divergent is already
+    // handled by the default order.
+    if (UI && UI->isUniform(LHS) && !UI->isUniform(RHS)) {
+      LLVM_DEBUG(
+          dbgs() << "NARY: Preferring uniform remainder for GEP index\n");
+      // LHS is uniform, prefer it as remainder - try RHS first
+      if (LHS != RHS) {
+        if (auto *NewGEP =
+                tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+          return NewGEP;
+      }
+      if (auto *NewGEP =
+              tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+        return NewGEP;
+      return nullptr;
+    }
+
+    // Default order
     if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
       return NewGEP;
     // Symmetrically, try IndexToSplit = RHS + LHS.
@@ -482,6 +522,38 @@ Instruction *NaryReassociatePass::tryReassociateBinaryOp(Value *LHS, Value *RHS,
     //   = (A op RHS) op B or (B op RHS) op A
     SCEVUse AExpr = SE->getSCEV(A), BExpr = SE->getSCEV(B);
     SCEVUse RHSExpr = SE->getSCEV(RHS);
+
+    // When uniformity analysis is available, prefer reassociations that group
+    // uniform values together. This can reduce register pressure on targets
+    // with divergent execution.
+    //
+    // For I = (A op B) op RHS, we can form:
+    //   - (A op RHS) op B: groups A and RHS
+    //   - (B op RHS) op A: groups B and RHS
+    //
+    // Prefer the grouping where both operands in the new sub-expression are
+    // uniform, keeping uniform computations grouped together.
+    //
+    // We only need to handle the case where B and RHS are uniform but A is
+    // divergent. The symmetric case (A and RHS uniform, B divergent) is already
+    // handled by the default order which tries (A op RHS) op B first.
+    if (UI && UI->isUniform(B) && UI->isUniform(RHS) && !UI->isUniform(A)) {
+      LLVM_DEBUG(dbgs() << "NARY: Preferring uniform grouping for " << *I
+                        << "\n");
+      if (AExpr != RHSExpr) {
+        if (Instruction *NewI =
+                tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
+          return NewI;
+      }
+      if (BExpr != RHSExpr) {
+        if (Instruction *NewI =
+                tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
+          return NewI;
+      }
+      return nullptr;
+    }
+
+    // Default order: try (A op RHS) op B first
     if (BExpr != RHSExpr) {
       if (auto *NewI =
               tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
@@ -649,6 +721,27 @@ Value *NaryReassociatePass::tryReassociateMinOrMax(Instruction *I,
   SCEVUse BExpr = SE->getSCEV(B);
   SCEVUse RHSExpr = SE->getSCEV(RHS);
 
+  // Similar to binary ops, prefer grouping uniform values together when
+  // uniformity analysis is available.
+  // For I = minmax(minmax(A, B), RHS), we can form:
+  //   - minmax(minmax(A, RHS), B): groups A and RHS
+  //   - minmax(minmax(B, RHS), A): groups B and RHS
+  if (UI && UI->isUniform(B) && UI->isUniform(RHS) && !UI->isUniform(A)) {
+    LLVM_DEBUG(dbgs() << "NARY: Preferring uniform grouping for minmax " << *I
+                      << "\n");
+    // Try (B op RHS) op A first - groups uniform B with uniform RHS
+    if (AExpr != RHSExpr) {
+      if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
+        return NewMinMax;
+    }
+    if (BExpr != RHSExpr) {
+      if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
+        return NewMinMax;
+    }
+    return nullptr;
+  }
+
+  // Default order
   if (BExpr != RHSExpr) {
     // Try (A op RHS) op B
     if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 81b9aae775ed8..3592c2d208947 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -516,9 +516,10 @@
 ; GCN-O1-OPTS-NEXT:      Straight line strength reduction
 ; GCN-O1-OPTS-NEXT:      Early CSE
 ; GCN-O1-OPTS-NEXT:      Scalar Evolution Analysis
+; GCN-O1-OPTS-NEXT:      Cycle Info Analysis
+; GCN-O1-OPTS-NEXT:      Uniformity Analysis
 ; GCN-O1-OPTS-NEXT:      Nary reassociation
 ; GCN-O1-OPTS-NEXT:      Early CSE
-; GCN-O1-OPTS-NEXT:      Cycle Info Analysis
 ; GCN-O1-OPTS-NEXT:      Uniformity Analysis
 ; GCN-O1-OPTS-NEXT:      AMDGPU IR optimizations
 ; GCN-O1-OPTS-NEXT:      Basic Alias Analysis (stateless AA impl)
@@ -833,9 +834,10 @@
 ; GCN-O2-NEXT:      Straight line strength reduction
 ; GCN-O2-NEXT:      Early CSE
 ; GCN-O2-NEXT:      Scalar Evolution Analysis
+; GCN-O2-NEXT:      Cycle Info Analysis
+; GCN-O2-NEXT:      Uniformity Analysis
 ; GCN-O2-NEXT:      Nary reassociation
 ; GCN-O2-NEXT:      Early CSE
-; GCN-O2-NEXT:      Cycle Info Analysis
 ; GCN-O2-NEXT:      Uniformity Analysis
 ; GCN-O2-NEXT:      AMDGPU IR optimizations
 ; GCN-O2-NEXT:      Basic Alias Analysis (stateless AA impl)
@@ -1166,9 +1168,10 @@
 ; GCN-O3-NEXT:      Optimization Remark Emitter
 ; GCN-O3-NEXT:      Global Value Numbering
 ; GCN-O3-NEXT:      Scalar Evolution Analysis
+; GCN-O3-NEXT:      Cycle Info Analysis
+; GCN-O3-NEXT:      Uniformity Analysis
 ; GCN-O3-NEXT:      Nary reassociation
 ; GCN-O3-NEXT:      Early CSE
-; GCN-O3-NEXT:      Cycle Info Analysis
 ; GCN-O3-NEXT:      Uniformity Analysis
 ; GCN-O3-NEXT:      AMDGPU IR optimizations
 ; GCN-O3-NEXT:      Basic Alias Analysis (stateless AA impl)
diff --git a/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
index d8f5c6009aa5e..487a6d9270121 100644
--- a/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
+++ b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
@@ -30,7 +30,7 @@ define amdgpu_kernel void @prefer_uniform_grouping(i32 %u1, i32 %u2) {
 ; CHECK-NEXT:    [[U1_U2:%.*]] = add i32 [[U1]], [[U2]]
 ; CHECK-NEXT:    call void @use(i32 [[D_U2]])
 ; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
-; CHECK-NEXT:    [[RESULT:%.*]] = add i32 [[D_U2]], [[U1]]
+; CHECK-NEXT:    [[RESULT:%.*]] = add i32 [[U1_U2]], [[D]]
 ; CHECK-NEXT:    call void @use(i32 [[RESULT]])
 ; CHECK-NEXT:    ret void
 ;
@@ -60,7 +60,7 @@ define amdgpu_kernel void @prefer_uniform_grouping_mul(i32 %u1, i32 %u2) {
 ; CHECK-NEXT:    [[U1_U2:%.*]] = mul i32 [[U1]], [[U2]]
 ; CHECK-NEXT:    call void @use(i32 [[D_U2]])
 ; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
-; CHECK-NEXT:    [[RESULT:%.*]] = mul i32 [[D_U2]], [[U1]]
+; CHECK-NEXT:    [[RESULT:%.*]] = mul i32 [[U1_U2]], [[D]]
 ; CHECK-NEXT:    call void @use(i32 [[RESULT]])
 ; CHECK-NEXT:    ret void
 ;
@@ -135,7 +135,7 @@ define amdgpu_kernel void @prefer_uniform_grouping_smax(i32 %u1, i32 %u2) {
 ; CHECK-NEXT:    [[U1_U2:%.*]] = call i32 @llvm.smax.i32(i32 [[U1]], i32 [[U2]])
 ; CHECK-NEXT:    call void @use(i32 [[D_U2]])
 ; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
-; CHECK-NEXT:    [[RESULT_NARY:%.*]] = call i32 @llvm.smax.i32(i32 [[D_U2]], i32 [[U1]])
+; CHECK-NEXT:    [[RESULT_NARY:%.*]] = call i32 @llvm.smax.i32(i32 [[U1_U2]], i32 [[D]])
 ; CHECK-NEXT:    call void @use(i32 [[RESULT_NARY]])
 ; CHECK-NEXT:    ret void
 ;
@@ -166,7 +166,7 @@ define amdgpu_kernel void @prefer_uniform_grouping_umin(i32 %u1, i32 %u2) {
 ; CHECK-NEXT:    [[U1_U2:%.*]] = call i32 @llvm.umin.i32(i32 [[U1]], i32 [[U2]])
 ; CHECK-NEXT:    call void @use(i32 [[D_U2]])
 ; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
-; CHECK-NEXT:    [[RESULT_NARY:%.*]] = call i32 @llvm.umin.i32(i32 [[D_U2]], i32 [[U1]])
+; CHECK-NEXT:    [[RESULT_NARY:%.*]] = call i32 @llvm.umin.i32(i32 [[U1_U2]], i32 [[D]])
 ; CHECK-NEXT:    call void @use(i32 [[RESULT_NARY]])
 ; CHECK-NEXT:    ret void
 ;
@@ -195,7 +195,7 @@ define amdgpu_kernel void @gep_lhs_uniform_rhs_divergent(ptr %base, i64 %u_offse
 ; CHECK-NEXT:    [[GEP_D:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[D_EXT]]
 ; CHECK-NEXT:    call void @use_ptr(ptr [[GEP_U]])
 ; CHECK-NEXT:    call void @use_ptr(ptr [[GEP_D]])
-; CHECK-NEXT:    [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_U]], i64 [[D_EXT]]
+; CHECK-NEXT:    [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_D]], i64 [[U_OFFSET]]
 ; CHECK-NEXT:    call void @use_ptr(ptr [[GEP_RESULT]])
 ; CHECK-NEXT:    ret void
 ;

>From eac646711a17072bd2abffd75d4147e8ef21f5f3 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Thu, 26 Feb 2026 18:40:31 +0530
Subject: [PATCH 3/4] review: address suggestions

---
 .../lib/Transforms/Scalar/NaryReassociate.cpp | 48 ++++++++-----------
 1 file changed, 20 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index 72dad5db0e931..c239e09470140 100644
--- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -181,7 +181,8 @@ bool NaryReassociateLegacyPass::runOnFunction(Function &F) {
 
   // UniformityInfo is required on all targets, but on targets without branch
   // divergence it does no work and reports everything as uniform.
-  auto *UI = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+  UniformityInfo *UI =
+      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
 
   return Impl.runImpl(F, AC, DT, SE, TLI, TTI, UI);
 }
@@ -196,7 +197,7 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
 
   // UniformityInfo is required on all targets, but on targets without branch
   // divergence it does no work and reports everything as uniform.
-  auto *UI = &AM.getResult<UniformityInfoAnalysis>(F);
+  UniformityInfo *UI = &AM.getResult<UniformityInfoAnalysis>(F);
 
   if (!runImpl(F, AC, DT, SE, TLI, TTI, UI))
     return PreservedAnalyses::all();
@@ -395,38 +396,29 @@ NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
     // tryReassociateGEPAtIndex(GEP, I, LHS, RHS, ...) looks for a dominating
     // GEP with LHS as index, then creates: NewGEP = existingGEP + RHS * scale.
     // So the RHS becomes the "remaining" index calculation.
-    //
-    // For uniformity: prefer the remaining calculation to be uniform, as it
-    // can then stay in scalar registers.
-    //
+
+    // When LHS == RHS, both call orders are identical, so only try once.
+    if (LHS == RHS)
+      return tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType);
+
+    // When uniformity analysis is available, prefer the remaining calculation
+    // to be uniform, keeping uniform computations grouped together.
     // Default order tries LHS first (RHS as remainder). If LHS is uniform and
-    // RHS is divergent, we want to try RHS first so uniform LHS becomes the
-    // remainder. The case where RHS is uniform and LHS is divergent is already
-    // handled by the default order.
+    // RHS is divergent, try RHS first so uniform LHS becomes the remainder.
     if (UI && UI->isUniform(LHS) && !UI->isUniform(RHS)) {
       LLVM_DEBUG(
           dbgs() << "NARY: Preferring uniform remainder for GEP index\n");
-      // LHS is uniform, prefer it as remainder - try RHS first
-      if (LHS != RHS) {
-        if (auto *NewGEP =
-                tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
-          return NewGEP;
-      }
-      if (auto *NewGEP =
-              tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+      if (GetElementPtrInst *NewGEP =
+              tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
         return NewGEP;
-      return nullptr;
+      return tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType);
     }
 
-    // Default order
-    if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+    // Default order: try (LHS, RHS) first, then (RHS, LHS).
+    if (GetElementPtrInst *NewGEP =
+            tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
       return NewGEP;
-    // Symmetrically, try IndexToSplit = RHS + LHS.
-    if (LHS != RHS) {
-      if (auto *NewGEP =
-              tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
-        return NewGEP;
-    }
+    return tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType);
   }
   return nullptr;
 }
@@ -731,11 +723,11 @@ Value *NaryReassociatePass::tryReassociateMinOrMax(Instruction *I,
                       << "\n");
     // Try (B op RHS) op A first - groups uniform B with uniform RHS
     if (AExpr != RHSExpr) {
-      if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
+      if (Value *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
         return NewMinMax;
     }
     if (BExpr != RHSExpr) {
-      if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
+      if (Value *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
         return NewMinMax;
     }
     return nullptr;

>From ca8645f14a6e0f7bf16e83721713686714b1306c Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Wed, 1 Apr 2026 13:33:29 +0530
Subject: [PATCH 4/4] Delay fetching uniformity for targets that don't need it

---
 llvm/lib/Transforms/Scalar/NaryReassociate.cpp | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index c239e09470140..6e4927fe7a329 100644
--- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -179,10 +179,11 @@ bool NaryReassociateLegacyPass::runOnFunction(Function &F) {
   auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
   auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  // UniformityInfo is required on all targets, but on targets without branch
-  // divergence it does no work and reports everything as uniform.
-  UniformityInfo *UI =
-      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+  // Only compute UniformityInfo on targets with branch divergence to avoid
+  // the compile-time cost of CycleAnalysis on targets that don't need it.
+  UniformityInfo *UI = nullptr;
+  if (TTI->hasBranchDivergence(&F))
+    UI = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
 
   return Impl.runImpl(F, AC, DT, SE, TLI, TTI, UI);
 }
@@ -195,9 +196,11 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
   auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
   auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
 
-  // UniformityInfo is required on all targets, but on targets without branch
-  // divergence it does no work and reports everything as uniform.
-  UniformityInfo *UI = &AM.getResult<UniformityInfoAnalysis>(F);
+  // Only compute UniformityInfo on targets with branch divergence to avoid
+  // the compile-time cost of CycleAnalysis on targets that don't need it.
+  UniformityInfo *UI = nullptr;
+  if (TTI->hasBranchDivergence(&F))
+    UI = &AM.getResult<UniformityInfoAnalysis>(F);
 
   if (!runImpl(F, AC, DT, SE, TLI, TTI, UI))
     return PreservedAnalyses::all();



More information about the llvm-commits mailing list