[llvm] [NaryReassociate] Teach NaryReassociate about UniformityAnalysis (PR #175167)
Pankaj Dwivedi via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 12 03:04:52 PST 2026
https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/175167
From 3cdf9ce45a55613456617697cc44bcaa87645331 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Mon, 12 Jan 2026 14:12:40 +0530
Subject: [PATCH 1/3] [NaryReassociate][AMDGPU] Pre-commit test for
uniformity-aware reassociation (NFC)
---
.../AMDGPU/nary-add-uniform.ll | 222 ++++++++++++++++++
1 file changed, 222 insertions(+)
create mode 100644 llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
diff --git a/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
new file mode 100644
index 0000000000000..951b612509e4e
--- /dev/null
+++ b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
@@ -0,0 +1,222 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; REQUIRES: amdgpu-registered-target
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -passes='nary-reassociate' -S | FileCheck %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare void @use(i32)
+
+; Test that NaryReassociate prefers grouping uniform values together when
+; uniformity analysis is available and both reassociation options exist.
+;
+; For I = (A op B) op RHS, the pass can form:
+; - (A op RHS) op B
+; - (B op RHS) op A
+;
+; When both dominating expressions exist, prefer the one grouping uniforms.
+
+; Both %d_u2 and %u1_u2 exist as dominating expressions.
+; For (d + u1) + u2:
+; - Without the UA preference: the pass tries (d + u2) first, finds %d_u2, and returns %d_u2 + u1.
+; - With the UA preference: B=u1 and RHS=u2 are uniform while A=d is divergent,
+;   so the pass prefers (u1 + u2) + d and returns %u1_u2 + d.
+;
+
+define amdgpu_kernel void @prefer_uniform_grouping(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_U2:%.*]] = add i32 [[D]], [[U2]]
+; CHECK-NEXT: [[U1_U2:%.*]] = add i32 [[U1]], [[U2]]
+; CHECK-NEXT: call void @use(i32 [[D_U2]])
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[D_U2]], [[U1]]
+; CHECK-NEXT: call void @use(i32 [[RESULT]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ ; Create both possible reassociation targets
+ %d_u2 = add i32 %d, %u2 ; divergent + uniform
+ %u1_u2 = add i32 %u1, %u2 ; uniform + uniform (should be preferred!)
+
+ call void @use(i32 %d_u2)
+ call void @use(i32 %u1_u2)
+
+ ; (d + u1) + u2: both (d + u2) and (u1 + u2) exist
+ ; Should prefer (u1 + u2) + d to group uniforms
+ %tmp = add i32 %d, %u1
+ %result = add i32 %tmp, %u2
+ call void @use(i32 %result)
+
+ ret void
+}
+
+define amdgpu_kernel void @prefer_uniform_grouping_mul(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_mul(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_U2:%.*]] = mul i32 [[D]], [[U2]]
+; CHECK-NEXT: [[U1_U2:%.*]] = mul i32 [[U1]], [[U2]]
+; CHECK-NEXT: call void @use(i32 [[D_U2]])
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT:%.*]] = mul i32 [[D_U2]], [[U1]]
+; CHECK-NEXT: call void @use(i32 [[RESULT]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ %d_u2 = mul i32 %d, %u2
+ %u1_u2 = mul i32 %u1, %u2
+
+ call void @use(i32 %d_u2)
+ call void @use(i32 %u1_u2)
+
+ %tmp = mul i32 %d, %u1
+ %result = mul i32 %tmp, %u2
+ call void @use(i32 %result)
+
+ ret void
+}
+
+define amdgpu_kernel void @only_one_option(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @only_one_option(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[U1_U2:%.*]] = add i32 [[U1]], [[U2]]
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[U1_U2]], [[D]]
+; CHECK-NEXT: call void @use(i32 [[RESULT]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ ; Only u1 + u2 exists, not d + u2
+ %u1_u2 = add i32 %u1, %u2
+ call void @use(i32 %u1_u2)
+
+ %tmp = add i32 %d, %u1
+ %result = add i32 %tmp, %u2
+ call void @use(i32 %result)
+
+ ret void
+}
+
+; When no dominating expression exists, no reassociation happens
+define amdgpu_kernel void @no_dominating_expr(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @no_dominating_expr(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP:%.*]] = add i32 [[D]], [[U1]]
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[TMP]], [[U2]]
+; CHECK-NEXT: call void @use(i32 [[RESULT]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ ; No dominating expressions exist
+ %tmp = add i32 %d, %u1
+ %result = add i32 %tmp, %u2
+ call void @use(i32 %result)
+
+ ret void
+}
+
+; Test smax: prefer grouping uniform values together
+; For smax(smax(A, B), RHS):
+; - smax(smax(A, RHS), B): groups A and RHS
+; - smax(smax(B, RHS), A): groups B and RHS
+; When B and RHS are uniform but A is divergent, prefer smax(smax(B, RHS), A)
+define amdgpu_kernel void @prefer_uniform_grouping_smax(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_smax(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_U2:%.*]] = call i32 @llvm.smax.i32(i32 [[D]], i32 [[U2]])
+; CHECK-NEXT: [[U1_U2:%.*]] = call i32 @llvm.smax.i32(i32 [[U1]], i32 [[U2]])
+; CHECK-NEXT: call void @use(i32 [[D_U2]])
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT_NARY:%.*]] = call i32 @llvm.smax.i32(i32 [[D_U2]], i32 [[U1]])
+; CHECK-NEXT: call void @use(i32 [[RESULT_NARY]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ ; Create both possible reassociation targets
+ %d_u2 = call i32 @llvm.smax.i32(i32 %d, i32 %u2) ; divergent, uniform
+ %u1_u2 = call i32 @llvm.smax.i32(i32 %u1, i32 %u2) ; uniform, uniform (preferred!)
+
+ call void @use(i32 %d_u2)
+ call void @use(i32 %u1_u2)
+
+ ; smax(smax(d, u1), u2): both smax(d, u2) and smax(u1, u2) exist
+ ; Should prefer smax(smax(u1, u2), d) to group uniforms
+ %tmp = call i32 @llvm.smax.i32(i32 %d, i32 %u1)
+ %result = call i32 @llvm.smax.i32(i32 %tmp, i32 %u2)
+ call void @use(i32 %result)
+
+ ret void
+}
+
+; Test umin: prefer grouping uniform values together
+define amdgpu_kernel void @prefer_uniform_grouping_umin(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_umin(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_U2:%.*]] = call i32 @llvm.umin.i32(i32 [[D]], i32 [[U2]])
+; CHECK-NEXT: [[U1_U2:%.*]] = call i32 @llvm.umin.i32(i32 [[U1]], i32 [[U2]])
+; CHECK-NEXT: call void @use(i32 [[D_U2]])
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT_NARY:%.*]] = call i32 @llvm.umin.i32(i32 [[D_U2]], i32 [[U1]])
+; CHECK-NEXT: call void @use(i32 [[RESULT_NARY]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ %d_u2 = call i32 @llvm.umin.i32(i32 %d, i32 %u2)
+ %u1_u2 = call i32 @llvm.umin.i32(i32 %u1, i32 %u2)
+
+ call void @use(i32 %d_u2)
+ call void @use(i32 %u1_u2)
+
+ %tmp = call i32 @llvm.umin.i32(i32 %d, i32 %u1)
+ %result = call i32 @llvm.umin.i32(i32 %tmp, i32 %u2)
+ call void @use(i32 %result)
+
+ ret void
+}
+
+; Test GEP: prefer uniform remainder in index calculation
+; For GEP with index = LHS + RHS:
+; - If RHS is uniform, look for a dominating GEP on LHS first (uniform RHS as remainder)
+; - If LHS is uniform, look for a dominating GEP on RHS first (uniform LHS as remainder)
+define amdgpu_kernel void @prefer_uniform_gep_remainder(ptr %base, i64 %u_offset) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_gep_remainder(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[U_OFFSET:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_EXT:%.*]] = zext i32 [[D]] to i64
+; CHECK-NEXT: [[GEP_D:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[D_EXT]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_D]])
+; CHECK-NEXT: [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_D]], i64 [[U_OFFSET]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_RESULT]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_ext = zext i32 %d to i64
+
+ ; Create dominating GEP with divergent index
+ %gep_d = getelementptr i32, ptr %base, i64 %d_ext
+ call void @use_ptr(ptr %gep_d)
+
+ ; GEP with index = d + u_offset
+ ; Should prefer finding dominating GEP with d, leaving uniform u_offset as remainder
+ %idx = add i64 %d_ext, %u_offset
+ %gep_result = getelementptr i32, ptr %base, i64 %idx
+ call void @use_ptr(ptr %gep_result)
+
+ ret void
+}
+
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare void @use_ptr(ptr)
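
For quick reference, here is the core pattern from @prefer_uniform_grouping above, distilled into a standalone module (the function name is made up); the "current" result below is what the pre-committed CHECK lines show, and the "uniformity-aware" result is what the follow-up patch updates them to:

define amdgpu_kernel void @sketch(i32 %u1, i32 %u2) {
  %d     = call i32 @llvm.amdgcn.workitem.id.x()   ; divergent
  %d_u2  = add i32 %d, %u2                         ; dominating divergent+uniform expr
  %u1_u2 = add i32 %u1, %u2                        ; dominating uniform+uniform expr
  call void @use(i32 %d_u2)
  call void @use(i32 %u1_u2)
  %tmp    = add i32 %d, %u1
  %result = add i32 %tmp, %u2                      ; (d + u1) + u2
  call void @use(i32 %result)
  ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()
declare void @use(i32)

; Current pass:            %result = add i32 %d_u2, %u1   (reuses the divergent expr)
; Uniformity-aware order:  %result = add i32 %u1_u2, %d   (reuses the all-uniform expr,
;                                                          which can stay in SGPRs)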
From 2e51a8a79191a709cfa88258233e5fb1cadbd5f6 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Mon, 12 Jan 2026 14:17:18 +0530
Subject: [PATCH 2/3] [NaryReassociate] Make uniformity-aware to prefer
grouping uniform values
---
.../llvm/Transforms/Scalar/NaryReassociate.h | 4 +-
.../lib/Transforms/Scalar/NaryReassociate.cpp | 151 ++++++++++++++----
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 9 +-
.../AMDGPU/nary-add-uniform.ll | 8 +-
4 files changed, 137 insertions(+), 35 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h b/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
index f0474bc4352e3..4318945bd82b3 100644
--- a/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
+++ b/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
@@ -80,6 +80,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/ValueHandle.h"
@@ -106,7 +107,7 @@ class NaryReassociatePass : public PassInfoMixin<NaryReassociatePass> {
// Glue for old PM.
bool runImpl(Function &F, AssumptionCache *AC_, DominatorTree *DT_,
ScalarEvolution *SE_, TargetLibraryInfo *TLI_,
- TargetTransformInfo *TTI_);
+ TargetTransformInfo *TTI_, UniformityInfo *UI_ = nullptr);
private:
// Runs only one iteration of the dominator-based algorithm. See the header
@@ -183,6 +184,7 @@ class NaryReassociatePass : public PassInfoMixin<NaryReassociatePass> {
ScalarEvolution *SE;
TargetLibraryInfo *TLI;
TargetTransformInfo *TTI;
+ UniformityInfo *UI;
// A lookup table quickly telling which instructions compute the given SCEV.
// Note that there can be multiple instructions at different locations
diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index ec145f2f48bea..43d7a9375b1d0 100644
--- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -104,6 +104,7 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -141,6 +142,7 @@ class NaryReassociateLegacyPass : public FunctionPass {
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<UniformityInfoWrapperPass>();
AU.setPreservesCFG();
}
@@ -159,6 +161,7 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(NaryReassociateLegacyPass, "nary-reassociate",
"Nary reassociation", false, false)
@@ -176,7 +179,11 @@ bool NaryReassociateLegacyPass::runOnFunction(Function &F) {
auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- return Impl.runImpl(F, AC, DT, SE, TLI, TTI);
+ // UniformityInfo is required on all targets, but on targets without branch
+ // divergence it does no work and reports everything as uniform.
+ auto *UI = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+
+ return Impl.runImpl(F, AC, DT, SE, TLI, TTI, UI);
}
PreservedAnalyses NaryReassociatePass::run(Function &F,
@@ -187,7 +194,11 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
- if (!runImpl(F, AC, DT, SE, TLI, TTI))
+ // UniformityInfo is required on all targets, but on targets without branch
+ // divergence it does no work and reports everything as uniform.
+ auto *UI = &AM.getResult<UniformityInfoAnalysis>(F);
+
+ if (!runImpl(F, AC, DT, SE, TLI, TTI, UI))
return PreservedAnalyses::all();
PreservedAnalyses PA;
@@ -199,12 +210,14 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
bool NaryReassociatePass::runImpl(Function &F, AssumptionCache *AC_,
DominatorTree *DT_, ScalarEvolution *SE_,
TargetLibraryInfo *TLI_,
- TargetTransformInfo *TTI_) {
+ TargetTransformInfo *TTI_,
+ UniformityInfo *UI_) {
AC = AC_;
DT = DT_;
SE = SE_;
TLI = TLI_;
TTI = TTI_;
+ UI = UI_;
DL = &F.getDataLayout();
bool Changed = false, ChangedInThisIteration;
@@ -379,13 +392,46 @@ NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1);
// IndexToSplit = LHS + RHS.
- if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
- return NewGEP;
- // Symmetrically, try IndexToSplit = RHS + LHS.
- if (LHS != RHS) {
+ // tryReassociateGEPAtIndex(GEP, I, LHS, RHS, ...) looks for a dominating
+ // GEP with LHS as index, then creates: NewGEP = existingGEP + RHS * scale.
+ // So the RHS becomes the "remaining" index calculation.
+ //
+ // For uniformity: prefer the remaining calculation to be uniform, as it
+ // can then stay in scalar registers. So if RHS is uniform but LHS is
+ // divergent, try LHS first (leaving uniform RHS as remainder).
+ if (UI && UI->isUniform(RHS) && !UI->isUniform(LHS)) {
+ LLVM_DEBUG(
+ dbgs() << "NARY: Preferring uniform remainder for GEP index\n");
+ if (auto *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+ return NewGEP;
+ if (LHS != RHS) {
+ if (auto *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+ return NewGEP;
+ }
+ } else if (UI && UI->isUniform(LHS) && !UI->isUniform(RHS)) {
+ LLVM_DEBUG(
+ dbgs() << "NARY: Preferring uniform remainder for GEP index\n");
+ // LHS is uniform, prefer it as remainder - try RHS first
+ if (LHS != RHS) {
+ if (auto *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+ return NewGEP;
+ }
+ if (auto *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+ return NewGEP;
+ } else {
+ // Default order
if (auto *NewGEP =
- tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+ tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
return NewGEP;
+ if (LHS != RHS) {
+ if (auto *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+ return NewGEP;
+ }
}
}
return nullptr;
@@ -483,15 +529,47 @@ Instruction *NaryReassociatePass::tryReassociateBinaryOp(Value *LHS, Value *RHS,
// = (A op RHS) op B or (B op RHS) op A
const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B);
const SCEV *RHSExpr = SE->getSCEV(RHS);
- if (BExpr != RHSExpr) {
- if (auto *NewI =
- tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
- return NewI;
- }
- if (AExpr != RHSExpr) {
- if (auto *NewI =
- tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
- return NewI;
+
+ // When uniformity analysis is available (e.g., on GPU targets), prefer
+ // reassociations that group uniform values together. This allows
+ // intermediate results to stay in scalar registers (SGPRs on AMDGPU),
+ // reducing vector register (VGPR) pressure.
+ //
+ // For I = (A op B) op RHS, we can form:
+ // - (A op RHS) op B: groups A and RHS
+ // - (B op RHS) op A: groups B and RHS
+ //
+ // Prefer the grouping where both operands in the new sub-expression are
+ // uniform, as this sub-expression can then be computed in scalar registers.
+ //
+ // We only need to handle the case where B and RHS are uniform but A is
+ // divergent. The symmetric case (A and RHS uniform, B divergent) is already
+ // handled by the default order which tries (A op RHS) op B first.
+ if (UI && UI->isUniform(B) && UI->isUniform(RHS) && !UI->isUniform(A)) {
+ LLVM_DEBUG(dbgs() << "NARY: Preferring uniform grouping for " << *I
+ << "\n");
+ if (AExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
+ return NewI;
+ }
+ if (BExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
+ return NewI;
+ }
+ } else {
+ // Default order: try (A op RHS) op B first
+ if (BExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
+ return NewI;
+ }
+ if (AExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
+ return NewI;
+ }
}
}
return nullptr;
@@ -653,16 +731,35 @@ Value *NaryReassociatePass::tryReassociateMinOrMax(Instruction *I,
const SCEV *BExpr = SE->getSCEV(B);
const SCEV *RHSExpr = SE->getSCEV(RHS);
- if (BExpr != RHSExpr) {
- // Try (A op RHS) op B
- if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
- return NewMinMax;
- }
-
- if (AExpr != RHSExpr) {
- // Try (RHS op B) op A
- if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
- return NewMinMax;
+ // Similar to binary ops, prefer grouping uniform values together when
+ // uniformity analysis is available.
+ // For I = minmax(minmax(A, B), RHS), we can form:
+ // - minmax(minmax(A, RHS), B): groups A and RHS
+ // - minmax(minmax(B, RHS), A): groups B and RHS
+ if (UI && UI->isUniform(B) && UI->isUniform(RHS) && !UI->isUniform(A)) {
+ LLVM_DEBUG(dbgs() << "NARY: Preferring uniform grouping for minmax " << *I
+ << "\n");
+ // Try (B op RHS) op A first - groups uniform B with uniform RHS
+ if (AExpr != RHSExpr) {
+ if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
+ return NewMinMax;
+ }
+ if (BExpr != RHSExpr) {
+ if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
+ return NewMinMax;
+ }
+ } else {
+ // Default order
+ if (BExpr != RHSExpr) {
+ // Try (A op RHS) op B
+ if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
+ return NewMinMax;
+ }
+ if (AExpr != RHSExpr) {
+ // Try (RHS op B) op A
+ if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
+ return NewMinMax;
+ }
}
return nullptr;
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 6940c1b238e1d..61e8ad30dc44f 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -515,9 +515,10 @@
; GCN-O1-OPTS-NEXT: Straight line strength reduction
; GCN-O1-OPTS-NEXT: Early CSE
; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis
+; GCN-O1-OPTS-NEXT: Cycle Info Analysis
+; GCN-O1-OPTS-NEXT: Uniformity Analysis
; GCN-O1-OPTS-NEXT: Nary reassociation
; GCN-O1-OPTS-NEXT: Early CSE
-; GCN-O1-OPTS-NEXT: Cycle Info Analysis
; GCN-O1-OPTS-NEXT: Uniformity Analysis
; GCN-O1-OPTS-NEXT: AMDGPU IR optimizations
; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl)
@@ -831,9 +832,10 @@
; GCN-O2-NEXT: Straight line strength reduction
; GCN-O2-NEXT: Early CSE
; GCN-O2-NEXT: Scalar Evolution Analysis
+; GCN-O2-NEXT: Cycle Info Analysis
+; GCN-O2-NEXT: Uniformity Analysis
; GCN-O2-NEXT: Nary reassociation
; GCN-O2-NEXT: Early CSE
-; GCN-O2-NEXT: Cycle Info Analysis
; GCN-O2-NEXT: Uniformity Analysis
; GCN-O2-NEXT: AMDGPU IR optimizations
; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
@@ -1163,9 +1165,10 @@
; GCN-O3-NEXT: Optimization Remark Emitter
; GCN-O3-NEXT: Global Value Numbering
; GCN-O3-NEXT: Scalar Evolution Analysis
+; GCN-O3-NEXT: Cycle Info Analysis
+; GCN-O3-NEXT: Uniformity Analysis
; GCN-O3-NEXT: Nary reassociation
; GCN-O3-NEXT: Early CSE
-; GCN-O3-NEXT: Cycle Info Analysis
; GCN-O3-NEXT: Uniformity Analysis
; GCN-O3-NEXT: AMDGPU IR optimizations
; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
diff --git a/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
index 951b612509e4e..3ac8ef54e4c68 100644
--- a/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
+++ b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
@@ -29,7 +29,7 @@ define amdgpu_kernel void @prefer_uniform_grouping(i32 %u1, i32 %u2) {
; CHECK-NEXT: [[U1_U2:%.*]] = add i32 [[U1]], [[U2]]
; CHECK-NEXT: call void @use(i32 [[D_U2]])
; CHECK-NEXT: call void @use(i32 [[U1_U2]])
-; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[D_U2]], [[U1]]
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[U1_U2]], [[D]]
; CHECK-NEXT: call void @use(i32 [[RESULT]])
; CHECK-NEXT: ret void
;
@@ -59,7 +59,7 @@ define amdgpu_kernel void @prefer_uniform_grouping_mul(i32 %u1, i32 %u2) {
; CHECK-NEXT: [[U1_U2:%.*]] = mul i32 [[U1]], [[U2]]
; CHECK-NEXT: call void @use(i32 [[D_U2]])
; CHECK-NEXT: call void @use(i32 [[U1_U2]])
-; CHECK-NEXT: [[RESULT:%.*]] = mul i32 [[D_U2]], [[U1]]
+; CHECK-NEXT: [[RESULT:%.*]] = mul i32 [[U1_U2]], [[D]]
; CHECK-NEXT: call void @use(i32 [[RESULT]])
; CHECK-NEXT: ret void
;
@@ -134,7 +134,7 @@ define amdgpu_kernel void @prefer_uniform_grouping_smax(i32 %u1, i32 %u2) {
; CHECK-NEXT: [[U1_U2:%.*]] = call i32 @llvm.smax.i32(i32 [[U1]], i32 [[U2]])
; CHECK-NEXT: call void @use(i32 [[D_U2]])
; CHECK-NEXT: call void @use(i32 [[U1_U2]])
-; CHECK-NEXT: [[RESULT_NARY:%.*]] = call i32 @llvm.smax.i32(i32 [[D_U2]], i32 [[U1]])
+; CHECK-NEXT: [[RESULT_NARY:%.*]] = call i32 @llvm.smax.i32(i32 [[U1_U2]], i32 [[D]])
; CHECK-NEXT: call void @use(i32 [[RESULT_NARY]])
; CHECK-NEXT: ret void
;
@@ -165,7 +165,7 @@ define amdgpu_kernel void @prefer_uniform_grouping_umin(i32 %u1, i32 %u2) {
; CHECK-NEXT: [[U1_U2:%.*]] = call i32 @llvm.umin.i32(i32 [[U1]], i32 [[U2]])
; CHECK-NEXT: call void @use(i32 [[D_U2]])
; CHECK-NEXT: call void @use(i32 [[U1_U2]])
-; CHECK-NEXT: [[RESULT_NARY:%.*]] = call i32 @llvm.umin.i32(i32 [[D_U2]], i32 [[U1]])
+; CHECK-NEXT: [[RESULT_NARY:%.*]] = call i32 @llvm.umin.i32(i32 [[U1_U2]], i32 [[D]])
; CHECK-NEXT: call void @use(i32 [[RESULT_NARY]])
; CHECK-NEXT: ret void
;
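
As a sanity check on the claim that only the "B and RHS uniform, A divergent" case needs special handling, here is a minimal sketch (not part of the patch; names are made up) of the symmetric case, where the existing default order already picks the all-uniform grouping:

define amdgpu_kernel void @symmetric_case(i32 %u1, i32 %u2) {
  %d = call i32 @llvm.amdgcn.workitem.id.x()   ; divergent
  %u1_u2 = add i32 %u1, %u2                    ; dominating all-uniform expr
  call void @use(i32 %u1_u2)
  %tmp = add i32 %u1, %d                       ; A = u1 (uniform), B = d (divergent)
  %result = add i32 %tmp, %u2                  ; (u1 + d) + u2, RHS = u2 (uniform)
  call void @use(i32 %result)
  ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()
declare void @use(i32)

; The default order tries (A op RHS) op B first, i.e. (u1 + u2) + d, finds the
; dominating %u1_u2, and rewrites %result to: add i32 %u1_u2, %d
; That is already the uniform grouping, so no UI check is needed on this path.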
From 74001cc516cc4ab364586feee61471d63949b19c Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Mon, 12 Jan 2026 16:34:36 +0530
Subject: [PATCH 3/3] review: refactor to keep default order code unchanged
---
.../lib/Transforms/Scalar/NaryReassociate.cpp | 71 ++++++++++---------
1 file changed, 38 insertions(+), 33 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index 43d7a9375b1d0..7ce9c0d289b78 100644
--- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -410,7 +410,10 @@ NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
return NewGEP;
}
- } else if (UI && UI->isUniform(LHS) && !UI->isUniform(RHS)) {
+ return nullptr;
+ }
+
+ if (UI && UI->isUniform(LHS) && !UI->isUniform(RHS)) {
LLVM_DEBUG(
dbgs() << "NARY: Preferring uniform remainder for GEP index\n");
// LHS is uniform, prefer it as remainder - try RHS first
@@ -422,16 +425,16 @@ NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
if (auto *NewGEP =
tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
return NewGEP;
- } else {
- // Default order
+ return nullptr;
+ }
+
+ // Default order
+ if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+ return NewGEP;
+ if (LHS != RHS) {
if (auto *NewGEP =
- tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+ tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
return NewGEP;
- if (LHS != RHS) {
- if (auto *NewGEP =
- tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
- return NewGEP;
- }
}
}
return nullptr;
@@ -558,18 +561,19 @@ Instruction *NaryReassociatePass::tryReassociateBinaryOp(Value *LHS, Value *RHS,
tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
return NewI;
}
- } else {
- // Default order: try (A op RHS) op B first
- if (BExpr != RHSExpr) {
- if (auto *NewI =
- tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
- return NewI;
- }
- if (AExpr != RHSExpr) {
- if (auto *NewI =
- tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
- return NewI;
- }
+ return nullptr;
+ }
+
+ // Default order: try (A op RHS) op B first
+ if (BExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
+ return NewI;
+ }
+ if (AExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
+ return NewI;
}
}
return nullptr;
@@ -748,18 +752,19 @@ Value *NaryReassociatePass::tryReassociateMinOrMax(Instruction *I,
if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
return NewMinMax;
}
- } else {
- // Default order
- if (BExpr != RHSExpr) {
- // Try (A op RHS) op B
- if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
- return NewMinMax;
- }
- if (AExpr != RHSExpr) {
- // Try (RHS op B) op A
- if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
- return NewMinMax;
- }
+ return nullptr;
+ }
+
+ // Default order
+ if (BExpr != RHSExpr) {
+ // Try (A op RHS) op B
+ if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
+ return NewMinMax;
+ }
+ if (AExpr != RHSExpr) {
+ // Try (RHS op B) op A
+ if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
+ return NewMinMax;
}
return nullptr;
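
To illustrate the GEP "uniform remainder" preference with both lookup candidates present, here is another sketch (again not part of the test file; names are made up, and the exact textual shape of the rewritten GEP may differ slightly from the comments below):

define amdgpu_kernel void @gep_sketch(ptr %base, i64 %u_offset) {
  %d = call i32 @llvm.amdgcn.workitem.id.x()
  %d_ext = zext i32 %d to i64                            ; divergent
  %gep_u = getelementptr i32, ptr %base, i64 %u_offset   ; dominating GEP, uniform index
  %gep_d = getelementptr i32, ptr %base, i64 %d_ext      ; dominating GEP, divergent index
  call void @use_ptr(ptr %gep_u)
  call void @use_ptr(ptr %gep_d)
  %idx = add i64 %u_offset, %d_ext                       ; uniform LHS + divergent RHS
  %gep = getelementptr i32, ptr %base, i64 %idx
  call void @use_ptr(ptr %gep)
  ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()
declare void @use_ptr(ptr)

; Default (LHS-first) order:  %gep = getelementptr i32, ptr %gep_u, i64 %d_ext
;                             (divergent remainder)
; Uniform-remainder order:    %gep = getelementptr i32, ptr %gep_d, i64 %u_offset
;                             (uniform remainder, which can stay in scalar form)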