[llvm] [NaryReassociate] Teach NaryReassociate about UniformityAnalysis (PR #175167)

Pankaj Dwivedi via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 12 03:04:52 PST 2026


https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/175167

From 3cdf9ce45a55613456617697cc44bcaa87645331 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Mon, 12 Jan 2026 14:12:40 +0530
Subject: [PATCH 1/3] [NaryReassociate][AMDGPU] Pre-commit test for
 uniformity-aware reassociation (NFC)

---
 .../AMDGPU/nary-add-uniform.ll                | 222 ++++++++++++++++++
 1 file changed, 222 insertions(+)
 create mode 100644 llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll

diff --git a/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
new file mode 100644
index 0000000000000..951b612509e4e
--- /dev/null
+++ b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
@@ -0,0 +1,222 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; REQUIRES: amdgpu-registered-target
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -passes='nary-reassociate' -S | FileCheck %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare void @use(i32)
+
+; Test that NaryReassociate prefers grouping uniform values together when
+; uniformity analysis is available and both reassociation options exist.
+;
+; For I = (A op B) op RHS, the pass can form:
+;   - (A op RHS) op B
+;   - (B op RHS) op A
+;
+; When both dominating expressions exist, prefer the one grouping uniforms.
+
+; Both %d_u2 and %u1_u2 exist as dominating expressions.
+; For (d + u1) + u2:
+;   - Without UA preference: would try (d + u2) first, find %d_u2, return %d_u2 + u1.
+;   - With UA preference: B=u1 and RHS=u2 are uniform while A=d is divergent,
+;                         so prefer (u1 + u2) + d, returning %u1_u2 + d.
+;
+
+define amdgpu_kernel void @prefer_uniform_grouping(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT:    [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[D_U2:%.*]] = add i32 [[D]], [[U2]]
+; CHECK-NEXT:    [[U1_U2:%.*]] = add i32 [[U1]], [[U2]]
+; CHECK-NEXT:    call void @use(i32 [[D_U2]])
+; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
+; CHECK-NEXT:    [[RESULT:%.*]] = add i32 [[D_U2]], [[U1]]
+; CHECK-NEXT:    call void @use(i32 [[RESULT]])
+; CHECK-NEXT:    ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  ; Create both possible reassociation targets
+  %d_u2 = add i32 %d, %u2        ; divergent + uniform
+  %u1_u2 = add i32 %u1, %u2      ; uniform + uniform (should be preferred!)
+
+  call void @use(i32 %d_u2)
+  call void @use(i32 %u1_u2)
+
+  ; (d + u1) + u2: both (d + u2) and (u1 + u2) exist
+  ; Should prefer (u1 + u2) + d to group uniforms
+  %tmp = add i32 %d, %u1
+  %result = add i32 %tmp, %u2
+  call void @use(i32 %result)
+
+  ret void
+}
+
+define amdgpu_kernel void @prefer_uniform_grouping_mul(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_mul(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT:    [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[D_U2:%.*]] = mul i32 [[D]], [[U2]]
+; CHECK-NEXT:    [[U1_U2:%.*]] = mul i32 [[U1]], [[U2]]
+; CHECK-NEXT:    call void @use(i32 [[D_U2]])
+; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
+; CHECK-NEXT:    [[RESULT:%.*]] = mul i32 [[D_U2]], [[U1]]
+; CHECK-NEXT:    call void @use(i32 [[RESULT]])
+; CHECK-NEXT:    ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  %d_u2 = mul i32 %d, %u2
+  %u1_u2 = mul i32 %u1, %u2
+
+  call void @use(i32 %d_u2)
+  call void @use(i32 %u1_u2)
+
+  %tmp = mul i32 %d, %u1
+  %result = mul i32 %tmp, %u2
+  call void @use(i32 %result)
+
+  ret void
+}
+
+define amdgpu_kernel void @only_one_option(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @only_one_option(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT:    [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[U1_U2:%.*]] = add i32 [[U1]], [[U2]]
+; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
+; CHECK-NEXT:    [[RESULT:%.*]] = add i32 [[U1_U2]], [[D]]
+; CHECK-NEXT:    call void @use(i32 [[RESULT]])
+; CHECK-NEXT:    ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  ; Only u1 + u2 exists, not d + u2
+  %u1_u2 = add i32 %u1, %u2
+  call void @use(i32 %u1_u2)
+
+  %tmp = add i32 %d, %u1
+  %result = add i32 %tmp, %u2
+  call void @use(i32 %result)
+
+  ret void
+}
+
+; When no dominating expression exists, no reassociation happens
+define amdgpu_kernel void @no_dominating_expr(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @no_dominating_expr(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT:    [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP:%.*]] = add i32 [[D]], [[U1]]
+; CHECK-NEXT:    [[RESULT:%.*]] = add i32 [[TMP]], [[U2]]
+; CHECK-NEXT:    call void @use(i32 [[RESULT]])
+; CHECK-NEXT:    ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  ; No dominating expressions exist
+  %tmp = add i32 %d, %u1
+  %result = add i32 %tmp, %u2
+  call void @use(i32 %result)
+
+  ret void
+}
+
+; Test smax: prefer grouping uniform values together
+; For smax(smax(A, B), RHS):
+;   - smax(smax(A, RHS), B): groups A and RHS
+;   - smax(smax(B, RHS), A): groups B and RHS
+; When B and RHS are uniform but A is divergent, prefer smax(smax(B, RHS), A)
+define amdgpu_kernel void @prefer_uniform_grouping_smax(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_smax(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT:    [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[D_U2:%.*]] = call i32 @llvm.smax.i32(i32 [[D]], i32 [[U2]])
+; CHECK-NEXT:    [[U1_U2:%.*]] = call i32 @llvm.smax.i32(i32 [[U1]], i32 [[U2]])
+; CHECK-NEXT:    call void @use(i32 [[D_U2]])
+; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
+; CHECK-NEXT:    [[RESULT_NARY:%.*]] = call i32 @llvm.smax.i32(i32 [[D_U2]], i32 [[U1]])
+; CHECK-NEXT:    call void @use(i32 [[RESULT_NARY]])
+; CHECK-NEXT:    ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  ; Create both possible reassociation targets
+  %d_u2 = call i32 @llvm.smax.i32(i32 %d, i32 %u2)    ; divergent, uniform
+  %u1_u2 = call i32 @llvm.smax.i32(i32 %u1, i32 %u2)  ; uniform, uniform (preferred!)
+
+  call void @use(i32 %d_u2)
+  call void @use(i32 %u1_u2)
+
+  ; smax(smax(d, u1), u2): both smax(d, u2) and smax(u1, u2) exist
+  ; Should prefer smax(smax(u1, u2), d) to group uniforms
+  %tmp = call i32 @llvm.smax.i32(i32 %d, i32 %u1)
+  %result = call i32 @llvm.smax.i32(i32 %tmp, i32 %u2)
+  call void @use(i32 %result)
+
+  ret void
+}
+
+; Test umin: prefer grouping uniform values together
+define amdgpu_kernel void @prefer_uniform_grouping_umin(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_umin(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT:    [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[D_U2:%.*]] = call i32 @llvm.umin.i32(i32 [[D]], i32 [[U2]])
+; CHECK-NEXT:    [[U1_U2:%.*]] = call i32 @llvm.umin.i32(i32 [[U1]], i32 [[U2]])
+; CHECK-NEXT:    call void @use(i32 [[D_U2]])
+; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
+; CHECK-NEXT:    [[RESULT_NARY:%.*]] = call i32 @llvm.umin.i32(i32 [[D_U2]], i32 [[U1]])
+; CHECK-NEXT:    call void @use(i32 [[RESULT_NARY]])
+; CHECK-NEXT:    ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+  %d_u2 = call i32 @llvm.umin.i32(i32 %d, i32 %u2)
+  %u1_u2 = call i32 @llvm.umin.i32(i32 %u1, i32 %u2)
+
+  call void @use(i32 %d_u2)
+  call void @use(i32 %u1_u2)
+
+  %tmp = call i32 @llvm.umin.i32(i32 %d, i32 %u1)
+  %result = call i32 @llvm.umin.i32(i32 %tmp, i32 %u2)
+  call void @use(i32 %result)
+
+  ret void
+}
+
+; Test GEP: prefer uniform remainder in index calculation
+; For GEP with index = LHS + RHS:
+;   - If RHS is uniform and LHS is divergent, try LHS first (uniform RHS as remainder)
+;   - If LHS is uniform and RHS is divergent, try RHS first (uniform LHS as remainder)
+define amdgpu_kernel void @prefer_uniform_gep_remainder(ptr %base, i64 %u_offset) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_gep_remainder(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[U_OFFSET:%.*]]) {
+; CHECK-NEXT:    [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[D_EXT:%.*]] = zext i32 [[D]] to i64
+; CHECK-NEXT:    [[GEP_D:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[D_EXT]]
+; CHECK-NEXT:    call void @use_ptr(ptr [[GEP_D]])
+; CHECK-NEXT:    [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_D]], i64 [[U_OFFSET]]
+; CHECK-NEXT:    call void @use_ptr(ptr [[GEP_RESULT]])
+; CHECK-NEXT:    ret void
+;
+  %d = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_ext = zext i32 %d to i64
+
+  ; Create dominating GEP with divergent index
+  %gep_d = getelementptr i32, ptr %base, i64 %d_ext
+  call void @use_ptr(ptr %gep_d)
+
+  ; GEP with index = d + u_offset
+  ; Should prefer finding dominating GEP with d, leaving uniform u_offset as remainder
+  %idx = add i64 %d_ext, %u_offset
+  %gep_result = getelementptr i32, ptr %base, i64 %idx
+  call void @use_ptr(ptr %gep_result)
+
+  ret void
+}
+
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare void @use_ptr(ptr)

From 2e51a8a79191a709cfa88258233e5fb1cadbd5f6 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Mon, 12 Jan 2026 14:17:18 +0530
Subject: [PATCH 2/3] [NaryReassociate] Make uniformity-aware to prefer
 grouping uniform values

---
 .../llvm/Transforms/Scalar/NaryReassociate.h  |   4 +-
 .../lib/Transforms/Scalar/NaryReassociate.cpp | 151 ++++++++++++++----
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |   9 +-
 .../AMDGPU/nary-add-uniform.ll                |   8 +-
 4 files changed, 137 insertions(+), 35 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h b/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
index f0474bc4352e3..4318945bd82b3 100644
--- a/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
+++ b/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
@@ -80,6 +80,7 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/ValueHandle.h"
 
@@ -106,7 +107,7 @@ class NaryReassociatePass : public PassInfoMixin<NaryReassociatePass> {
   // Glue for old PM.
   bool runImpl(Function &F, AssumptionCache *AC_, DominatorTree *DT_,
                ScalarEvolution *SE_, TargetLibraryInfo *TLI_,
-               TargetTransformInfo *TTI_);
+               TargetTransformInfo *TTI_, UniformityInfo *UI_ = nullptr);
 
 private:
   // Runs only one iteration of the dominator-based algorithm. See the header
@@ -183,6 +184,7 @@ class NaryReassociatePass : public PassInfoMixin<NaryReassociatePass> {
   ScalarEvolution *SE;
   TargetLibraryInfo *TLI;
   TargetTransformInfo *TTI;
+  UniformityInfo *UI;
 
   // A lookup table quickly telling which instructions compute the given SCEV.
   // Note that there can be multiple instructions at different locations
diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index ec145f2f48bea..43d7a9375b1d0 100644
--- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -104,6 +104,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -141,6 +142,7 @@ class NaryReassociateLegacyPass : public FunctionPass {
     AU.addRequired<ScalarEvolutionWrapperPass>();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<UniformityInfoWrapperPass>();
     AU.setPreservesCFG();
   }
 
@@ -159,6 +161,7 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
 INITIALIZE_PASS_END(NaryReassociateLegacyPass, "nary-reassociate",
                     "Nary reassociation", false, false)
 
@@ -176,7 +179,11 @@ bool NaryReassociateLegacyPass::runOnFunction(Function &F) {
   auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
   auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  return Impl.runImpl(F, AC, DT, SE, TLI, TTI);
+  // UniformityInfo is required on all targets, but on targets without branch
+  // divergence it does no work and reports everything as uniform.
+  auto *UI = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+
+  return Impl.runImpl(F, AC, DT, SE, TLI, TTI, UI);
 }
 
 PreservedAnalyses NaryReassociatePass::run(Function &F,
@@ -187,7 +194,11 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
   auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
   auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
 
-  if (!runImpl(F, AC, DT, SE, TLI, TTI))
+  // UniformityInfo is required on all targets, but on targets without branch
+  // divergence it does no work and reports everything as uniform.
+  auto *UI = &AM.getResult<UniformityInfoAnalysis>(F);
+
+  if (!runImpl(F, AC, DT, SE, TLI, TTI, UI))
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA;
@@ -199,12 +210,14 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
 bool NaryReassociatePass::runImpl(Function &F, AssumptionCache *AC_,
                                   DominatorTree *DT_, ScalarEvolution *SE_,
                                   TargetLibraryInfo *TLI_,
-                                  TargetTransformInfo *TTI_) {
+                                  TargetTransformInfo *TTI_,
+                                  UniformityInfo *UI_) {
   AC = AC_;
   DT = DT_;
   SE = SE_;
   TLI = TLI_;
   TTI = TTI_;
+  UI = UI_;
   DL = &F.getDataLayout();
 
   bool Changed = false, ChangedInThisIteration;
@@ -379,13 +392,46 @@ NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
 
     Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1);
     // IndexToSplit = LHS + RHS.
-    if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
-      return NewGEP;
-    // Symmetrically, try IndexToSplit = RHS + LHS.
-    if (LHS != RHS) {
+    // tryReassociateGEPAtIndex(GEP, I, LHS, RHS, ...) looks for a dominating
+    // GEP with LHS as index, then creates: NewGEP = existingGEP + RHS * scale.
+    // So the RHS becomes the "remaining" index calculation.
+    //
+    // For uniformity: prefer the remaining calculation to be uniform, as it
+    // can then stay in scalar registers. So if RHS is uniform but LHS is
+    // divergent, try LHS first (leaving uniform RHS as remainder).
+    if (UI && UI->isUniform(RHS) && !UI->isUniform(LHS)) {
+      LLVM_DEBUG(
+          dbgs() << "NARY: Preferring uniform remainder for GEP index\n");
+      if (auto *NewGEP =
+              tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+        return NewGEP;
+      if (LHS != RHS) {
+        if (auto *NewGEP =
+                tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+          return NewGEP;
+      }
+    } else if (UI && UI->isUniform(LHS) && !UI->isUniform(RHS)) {
+      LLVM_DEBUG(
+          dbgs() << "NARY: Preferring uniform remainder for GEP index\n");
+      // LHS is uniform, prefer it as remainder - try RHS first
+      if (LHS != RHS) {
+        if (auto *NewGEP =
+                tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+          return NewGEP;
+      }
+      if (auto *NewGEP =
+              tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+        return NewGEP;
+    } else {
+      // Default order
       if (auto *NewGEP =
-              tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+              tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
         return NewGEP;
+      if (LHS != RHS) {
+        if (auto *NewGEP =
+                tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+          return NewGEP;
+      }
     }
   }
   return nullptr;
@@ -483,15 +529,47 @@ Instruction *NaryReassociatePass::tryReassociateBinaryOp(Value *LHS, Value *RHS,
     //   = (A op RHS) op B or (B op RHS) op A
     const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B);
     const SCEV *RHSExpr = SE->getSCEV(RHS);
-    if (BExpr != RHSExpr) {
-      if (auto *NewI =
-              tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
-        return NewI;
-    }
-    if (AExpr != RHSExpr) {
-      if (auto *NewI =
-              tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
-        return NewI;
+
+    // When uniformity analysis is available (e.g., on GPU targets), prefer
+    // reassociations that group uniform values together. This allows
+    // intermediate results to stay in scalar registers (SGPRs on AMDGPU),
+    // reducing vector register (VGPR) pressure.
+    //
+    // For I = (A op B) op RHS, we can form:
+    //   - (A op RHS) op B: groups A and RHS
+    //   - (B op RHS) op A: groups B and RHS
+    //
+    // Prefer the grouping where both operands in the new sub-expression are
+    // uniform, as this sub-expression can then be computed in scalar registers.
+    //
+    // We only need to handle the case where B and RHS are uniform but A is
+    // divergent. The symmetric case (A and RHS uniform, B divergent) is already
+    // handled by the default order which tries (A op RHS) op B first.
+    if (UI && UI->isUniform(B) && UI->isUniform(RHS) && !UI->isUniform(A)) {
+      LLVM_DEBUG(dbgs() << "NARY: Preferring uniform grouping for " << *I
+                        << "\n");
+      if (AExpr != RHSExpr) {
+        if (auto *NewI =
+                tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
+          return NewI;
+      }
+      if (BExpr != RHSExpr) {
+        if (auto *NewI =
+                tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
+          return NewI;
+      }
+    } else {
+      // Default order: try (A op RHS) op B first
+      if (BExpr != RHSExpr) {
+        if (auto *NewI =
+                tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
+          return NewI;
+      }
+      if (AExpr != RHSExpr) {
+        if (auto *NewI =
+                tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
+          return NewI;
+      }
     }
   }
   return nullptr;
@@ -653,16 +731,35 @@ Value *NaryReassociatePass::tryReassociateMinOrMax(Instruction *I,
   const SCEV *BExpr = SE->getSCEV(B);
   const SCEV *RHSExpr = SE->getSCEV(RHS);
 
-  if (BExpr != RHSExpr) {
-    // Try (A op RHS) op B
-    if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
-      return NewMinMax;
-  }
-
-  if (AExpr != RHSExpr) {
-    // Try (RHS op B) op A
-    if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
-      return NewMinMax;
+  // Similar to binary ops, prefer grouping uniform values together when
+  // uniformity analysis is available.
+  // For I = minmax(minmax(A, B), RHS), we can form:
+  //   - minmax(minmax(A, RHS), B): groups A and RHS
+  //   - minmax(minmax(B, RHS), A): groups B and RHS
+  if (UI && UI->isUniform(B) && UI->isUniform(RHS) && !UI->isUniform(A)) {
+    LLVM_DEBUG(dbgs() << "NARY: Preferring uniform grouping for minmax " << *I
+                      << "\n");
+    // Try (B op RHS) op A first - groups uniform B with uniform RHS
+    if (AExpr != RHSExpr) {
+      if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
+        return NewMinMax;
+    }
+    if (BExpr != RHSExpr) {
+      if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
+        return NewMinMax;
+    }
+  } else {
+    // Default order
+    if (BExpr != RHSExpr) {
+      // Try (A op RHS) op B
+      if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
+        return NewMinMax;
+    }
+    if (AExpr != RHSExpr) {
+      // Try (RHS op B) op A
+      if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
+        return NewMinMax;
+    }
   }
 
   return nullptr;
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 6940c1b238e1d..61e8ad30dc44f 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -515,9 +515,10 @@
 ; GCN-O1-OPTS-NEXT:      Straight line strength reduction
 ; GCN-O1-OPTS-NEXT:      Early CSE
 ; GCN-O1-OPTS-NEXT:      Scalar Evolution Analysis
+; GCN-O1-OPTS-NEXT:      Cycle Info Analysis
+; GCN-O1-OPTS-NEXT:      Uniformity Analysis
 ; GCN-O1-OPTS-NEXT:      Nary reassociation
 ; GCN-O1-OPTS-NEXT:      Early CSE
-; GCN-O1-OPTS-NEXT:      Cycle Info Analysis
 ; GCN-O1-OPTS-NEXT:      Uniformity Analysis
 ; GCN-O1-OPTS-NEXT:      AMDGPU IR optimizations
 ; GCN-O1-OPTS-NEXT:      Basic Alias Analysis (stateless AA impl)
@@ -831,9 +832,10 @@
 ; GCN-O2-NEXT:      Straight line strength reduction
 ; GCN-O2-NEXT:      Early CSE
 ; GCN-O2-NEXT:      Scalar Evolution Analysis
+; GCN-O2-NEXT:      Cycle Info Analysis
+; GCN-O2-NEXT:      Uniformity Analysis
 ; GCN-O2-NEXT:      Nary reassociation
 ; GCN-O2-NEXT:      Early CSE
-; GCN-O2-NEXT:      Cycle Info Analysis
 ; GCN-O2-NEXT:      Uniformity Analysis
 ; GCN-O2-NEXT:      AMDGPU IR optimizations
 ; GCN-O2-NEXT:      Basic Alias Analysis (stateless AA impl)
@@ -1163,9 +1165,10 @@
 ; GCN-O3-NEXT:      Optimization Remark Emitter
 ; GCN-O3-NEXT:      Global Value Numbering
 ; GCN-O3-NEXT:      Scalar Evolution Analysis
+; GCN-O3-NEXT:      Cycle Info Analysis
+; GCN-O3-NEXT:      Uniformity Analysis
 ; GCN-O3-NEXT:      Nary reassociation
 ; GCN-O3-NEXT:      Early CSE
-; GCN-O3-NEXT:      Cycle Info Analysis
 ; GCN-O3-NEXT:      Uniformity Analysis
 ; GCN-O3-NEXT:      AMDGPU IR optimizations
 ; GCN-O3-NEXT:      Basic Alias Analysis (stateless AA impl)
diff --git a/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
index 951b612509e4e..3ac8ef54e4c68 100644
--- a/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
+++ b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
@@ -29,7 +29,7 @@ define amdgpu_kernel void @prefer_uniform_grouping(i32 %u1, i32 %u2) {
 ; CHECK-NEXT:    [[U1_U2:%.*]] = add i32 [[U1]], [[U2]]
 ; CHECK-NEXT:    call void @use(i32 [[D_U2]])
 ; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
-; CHECK-NEXT:    [[RESULT:%.*]] = add i32 [[D_U2]], [[U1]]
+; CHECK-NEXT:    [[RESULT:%.*]] = add i32 [[U1_U2]], [[D]]
 ; CHECK-NEXT:    call void @use(i32 [[RESULT]])
 ; CHECK-NEXT:    ret void
 ;
@@ -59,7 +59,7 @@ define amdgpu_kernel void @prefer_uniform_grouping_mul(i32 %u1, i32 %u2) {
 ; CHECK-NEXT:    [[U1_U2:%.*]] = mul i32 [[U1]], [[U2]]
 ; CHECK-NEXT:    call void @use(i32 [[D_U2]])
 ; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
-; CHECK-NEXT:    [[RESULT:%.*]] = mul i32 [[D_U2]], [[U1]]
+; CHECK-NEXT:    [[RESULT:%.*]] = mul i32 [[U1_U2]], [[D]]
 ; CHECK-NEXT:    call void @use(i32 [[RESULT]])
 ; CHECK-NEXT:    ret void
 ;
@@ -134,7 +134,7 @@ define amdgpu_kernel void @prefer_uniform_grouping_smax(i32 %u1, i32 %u2) {
 ; CHECK-NEXT:    [[U1_U2:%.*]] = call i32 @llvm.smax.i32(i32 [[U1]], i32 [[U2]])
 ; CHECK-NEXT:    call void @use(i32 [[D_U2]])
 ; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
-; CHECK-NEXT:    [[RESULT_NARY:%.*]] = call i32 @llvm.smax.i32(i32 [[D_U2]], i32 [[U1]])
+; CHECK-NEXT:    [[RESULT_NARY:%.*]] = call i32 @llvm.smax.i32(i32 [[U1_U2]], i32 [[D]])
 ; CHECK-NEXT:    call void @use(i32 [[RESULT_NARY]])
 ; CHECK-NEXT:    ret void
 ;
@@ -165,7 +165,7 @@ define amdgpu_kernel void @prefer_uniform_grouping_umin(i32 %u1, i32 %u2) {
 ; CHECK-NEXT:    [[U1_U2:%.*]] = call i32 @llvm.umin.i32(i32 [[U1]], i32 [[U2]])
 ; CHECK-NEXT:    call void @use(i32 [[D_U2]])
 ; CHECK-NEXT:    call void @use(i32 [[U1_U2]])
-; CHECK-NEXT:    [[RESULT_NARY:%.*]] = call i32 @llvm.umin.i32(i32 [[D_U2]], i32 [[U1]])
+; CHECK-NEXT:    [[RESULT_NARY:%.*]] = call i32 @llvm.umin.i32(i32 [[U1_U2]], i32 [[D]])
 ; CHECK-NEXT:    call void @use(i32 [[RESULT_NARY]])
 ; CHECK-NEXT:    ret void
 ;
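
For readers skimming the hunks above: the ordering decision added to
tryReassociateBinaryOp and tryReassociateMinOrMax reduces to a single
predicate on the operands. A minimal C++ sketch follows; the helper name
preferUniformGrouping is illustrative only (it does not exist in the patch),
and it assumes the UniformityInfo::isUniform(const Value *) query that the
patch already calls:

  #include "llvm/Analysis/UniformityAnalysis.h"
  #include "llvm/IR/Value.h"

  using namespace llvm;

  // Illustrative helper, not part of the patch: returns true when the
  // (B op RHS) op A ordering should be tried first, i.e. when grouping B
  // with RHS keeps the new sub-expression uniform while A is the divergent
  // operand. The symmetric case (A and RHS uniform, B divergent) needs no
  // special handling because the default order already tries
  // (A op RHS) op B first.
  static bool preferUniformGrouping(const UniformityInfo *UI, const Value *A,
                                    const Value *B, const Value *RHS) {
    return UI && UI->isUniform(B) && UI->isUniform(RHS) && !UI->isUniform(A);
  }

On targets without branch divergence, UniformityAnalysis reports every value
as uniform, so this predicate is always false and the pre-existing default
order is used unchanged.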

From 74001cc516cc4ab364586feee61471d63949b19c Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Mon, 12 Jan 2026 16:34:36 +0530
Subject: [PATCH 3/3] review: refactor to keep default order code unchanged

---
 .../lib/Transforms/Scalar/NaryReassociate.cpp | 71 ++++++++++---------
 1 file changed, 38 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index 43d7a9375b1d0..7ce9c0d289b78 100644
--- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -410,7 +410,10 @@ NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
                 tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
           return NewGEP;
       }
-    } else if (UI && UI->isUniform(LHS) && !UI->isUniform(RHS)) {
+      return nullptr;
+    }
+
+    if (UI && UI->isUniform(LHS) && !UI->isUniform(RHS)) {
       LLVM_DEBUG(
           dbgs() << "NARY: Preferring uniform remainder for GEP index\n");
       // LHS is uniform, prefer it as remainder - try RHS first
@@ -422,16 +425,16 @@ NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
       if (auto *NewGEP =
               tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
         return NewGEP;
-    } else {
-      // Default order
+      return nullptr;
+    }
+
+    // Default order
+    if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+      return NewGEP;
+    if (LHS != RHS) {
       if (auto *NewGEP =
-              tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+              tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
         return NewGEP;
-      if (LHS != RHS) {
-        if (auto *NewGEP =
-                tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
-          return NewGEP;
-      }
     }
   }
   return nullptr;
@@ -558,18 +561,19 @@ Instruction *NaryReassociatePass::tryReassociateBinaryOp(Value *LHS, Value *RHS,
                 tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
           return NewI;
       }
-    } else {
-      // Default order: try (A op RHS) op B first
-      if (BExpr != RHSExpr) {
-        if (auto *NewI =
-                tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
-          return NewI;
-      }
-      if (AExpr != RHSExpr) {
-        if (auto *NewI =
-                tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
-          return NewI;
-      }
+      return nullptr;
+    }
+
+    // Default order: try (A op RHS) op B first
+    if (BExpr != RHSExpr) {
+      if (auto *NewI =
+              tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
+        return NewI;
+    }
+    if (AExpr != RHSExpr) {
+      if (auto *NewI =
+              tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
+        return NewI;
     }
   }
   return nullptr;
@@ -748,18 +752,19 @@ Value *NaryReassociatePass::tryReassociateMinOrMax(Instruction *I,
       if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
         return NewMinMax;
     }
-  } else {
-    // Default order
-    if (BExpr != RHSExpr) {
-      // Try (A op RHS) op B
-      if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
-        return NewMinMax;
-    }
-    if (AExpr != RHSExpr) {
-      // Try (RHS op B) op A
-      if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
-        return NewMinMax;
-    }
+    return nullptr;
+  }
+
+  // Default order
+  if (BExpr != RHSExpr) {
+    // Try (A op RHS) op B
+    if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
+      return NewMinMax;
+  }
+  if (AExpr != RHSExpr) {
+    // Try (RHS op B) op A
+    if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
+      return NewMinMax;
   }
 
   return nullptr;
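
Pieced together from the hunks above, the post-refactor shape of the
ordering logic in tryReassociateMinOrMax looks roughly like this (a
paraphrase assembled from PATCH 2/3 and PATCH 3/3, not a verbatim copy;
variable names as in the diff, LLVM_DEBUG output omitted):

  if (UI && UI->isUniform(B) && UI->isUniform(RHS) && !UI->isUniform(A)) {
    // Uniformity-preferring path: try (B op RHS) op A first, grouping the
    // uniform B with the uniform RHS, then fall back to the other grouping.
    if (AExpr != RHSExpr) {
      if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
        return NewMinMax;
    }
    if (BExpr != RHSExpr) {
      if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
        return NewMinMax;
    }
    return nullptr;
  }

  // Default order
  if (BExpr != RHSExpr) {
    // Try (A op RHS) op B
    if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
      return NewMinMax;
  }
  if (AExpr != RHSExpr) {
    // Try (RHS op B) op A
    if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
      return NewMinMax;
  }

  return nullptr;

The early return at the end of the uniformity-preferring path is what lets
the default-order block below it stay essentially unchanged from the
original code, which is the point of PATCH 3/3.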


