[llvm] [NaryReassociate] Teach NaryReassociate about UniformityAnalysis (PR #175167)
Pankaj Dwivedi via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 1 01:03:46 PDT 2026
https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/175167
>From 40984c61fd9e6b7e6141c1d1b1390483e918a6ba Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Tue, 13 Jan 2026 18:30:13 +0530
Subject: [PATCH 1/4] [NaryReassociate][AMDGPU] Pre-commit test for
uniformity-aware reassociation (NFC)
---
.../AMDGPU/nary-add-uniform.ll | 319 ++++++++++++++++++
1 file changed, 319 insertions(+)
create mode 100644 llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
diff --git a/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
new file mode 100644
index 0000000000000..d8f5c6009aa5e
--- /dev/null
+++ b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
@@ -0,0 +1,319 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; REQUIRES: amdgpu-registered-target
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -passes='nary-reassociate' -S | FileCheck %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+declare void @use(i32)
+
+; Test that NaryReassociate prefers grouping uniform values together when
+; uniformity analysis is available and both reassociation options exist.
+;
+; For I = (A op B) op RHS, the pass can form:
+; - (A op RHS) op B
+; - (B op RHS) op A
+;
+; When both dominating expressions exist, prefer the one grouping uniforms.
+
+; Both %d_u2 and %u1_u2 exist as dominating expressions.
+; For (d + u1) + u2:
+; - Without UA preference: would try (d + u2) first, find %d_u2, return %d_u2 + u1
+; - With UA preference: B=u1 and RHS=u2 are uniform, A=d is divergent
+; So prefer (u1 + u2) + d, returning %u1_u2 + d
+;
+
+define amdgpu_kernel void @prefer_uniform_grouping(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_U2:%.*]] = add i32 [[D]], [[U2]]
+; CHECK-NEXT: [[U1_U2:%.*]] = add i32 [[U1]], [[U2]]
+; CHECK-NEXT: call void @use(i32 [[D_U2]])
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[D_U2]], [[U1]]
+; CHECK-NEXT: call void @use(i32 [[RESULT]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ ; Create both possible reassociation targets
+ %d_u2 = add i32 %d, %u2 ; divergent + uniform
+ %u1_u2 = add i32 %u1, %u2 ; uniform + uniform (should be preferred!)
+
+ call void @use(i32 %d_u2)
+ call void @use(i32 %u1_u2)
+
+ ; (d + u1) + u2: both (d + u2) and (u1 + u2) exist
+ ; Should prefer (u1 + u2) + d to group uniforms
+ %tmp = add i32 %d, %u1
+ %result = add i32 %tmp, %u2
+ call void @use(i32 %result)
+
+ ret void
+}
+
+define amdgpu_kernel void @prefer_uniform_grouping_mul(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_mul(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_U2:%.*]] = mul i32 [[D]], [[U2]]
+; CHECK-NEXT: [[U1_U2:%.*]] = mul i32 [[U1]], [[U2]]
+; CHECK-NEXT: call void @use(i32 [[D_U2]])
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT:%.*]] = mul i32 [[D_U2]], [[U1]]
+; CHECK-NEXT: call void @use(i32 [[RESULT]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ %d_u2 = mul i32 %d, %u2
+ %u1_u2 = mul i32 %u1, %u2
+
+ call void @use(i32 %d_u2)
+ call void @use(i32 %u1_u2)
+
+ %tmp = mul i32 %d, %u1
+ %result = mul i32 %tmp, %u2
+ call void @use(i32 %result)
+
+ ret void
+}
+
+define amdgpu_kernel void @only_one_option(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @only_one_option(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[U1_U2:%.*]] = add i32 [[U1]], [[U2]]
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[U1_U2]], [[D]]
+; CHECK-NEXT: call void @use(i32 [[RESULT]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ ; Only u1 + u2 exists, not d + u2
+ %u1_u2 = add i32 %u1, %u2
+ call void @use(i32 %u1_u2)
+
+ %tmp = add i32 %d, %u1
+ %result = add i32 %tmp, %u2
+ call void @use(i32 %result)
+
+ ret void
+}
+
+; When no dominating expression exists, no reassociation happens
+define amdgpu_kernel void @no_dominating_expr(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @no_dominating_expr(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP:%.*]] = add i32 [[D]], [[U1]]
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[TMP]], [[U2]]
+; CHECK-NEXT: call void @use(i32 [[RESULT]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ ; No dominating expressions exist
+ %tmp = add i32 %d, %u1
+ %result = add i32 %tmp, %u2
+ call void @use(i32 %result)
+
+ ret void
+}
+
+; Test smax: prefer grouping uniform values together
+; For smax(smax(A, B), RHS):
+; - smax(smax(A, RHS), B): groups A and RHS
+; - smax(smax(B, RHS), A): groups B and RHS
+; When B and RHS are uniform but A is divergent, prefer smax(smax(B, RHS), A)
+define amdgpu_kernel void @prefer_uniform_grouping_smax(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_smax(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_U2:%.*]] = call i32 @llvm.smax.i32(i32 [[D]], i32 [[U2]])
+; CHECK-NEXT: [[U1_U2:%.*]] = call i32 @llvm.smax.i32(i32 [[U1]], i32 [[U2]])
+; CHECK-NEXT: call void @use(i32 [[D_U2]])
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT_NARY:%.*]] = call i32 @llvm.smax.i32(i32 [[D_U2]], i32 [[U1]])
+; CHECK-NEXT: call void @use(i32 [[RESULT_NARY]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ ; Create both possible reassociation targets
+ %d_u2 = call i32 @llvm.smax.i32(i32 %d, i32 %u2) ; divergent, uniform
+ %u1_u2 = call i32 @llvm.smax.i32(i32 %u1, i32 %u2) ; uniform, uniform (preferred!)
+
+ call void @use(i32 %d_u2)
+ call void @use(i32 %u1_u2)
+
+ ; smax(smax(d, u1), u2): both smax(d, u2) and smax(u1, u2) exist
+ ; Should prefer smax(smax(u1, u2), d) to group uniforms
+ %tmp = call i32 @llvm.smax.i32(i32 %d, i32 %u1)
+ %result = call i32 @llvm.smax.i32(i32 %tmp, i32 %u2)
+ call void @use(i32 %result)
+
+ ret void
+}
+
+; Test umin: prefer grouping uniform values together
+define amdgpu_kernel void @prefer_uniform_grouping_umin(i32 %u1, i32 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @prefer_uniform_grouping_umin(
+; CHECK-SAME: i32 [[U1:%.*]], i32 [[U2:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_U2:%.*]] = call i32 @llvm.umin.i32(i32 [[D]], i32 [[U2]])
+; CHECK-NEXT: [[U1_U2:%.*]] = call i32 @llvm.umin.i32(i32 [[U1]], i32 [[U2]])
+; CHECK-NEXT: call void @use(i32 [[D_U2]])
+; CHECK-NEXT: call void @use(i32 [[U1_U2]])
+; CHECK-NEXT: [[RESULT_NARY:%.*]] = call i32 @llvm.umin.i32(i32 [[D_U2]], i32 [[U1]])
+; CHECK-NEXT: call void @use(i32 [[RESULT_NARY]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+
+ %d_u2 = call i32 @llvm.umin.i32(i32 %d, i32 %u2)
+ %u1_u2 = call i32 @llvm.umin.i32(i32 %u1, i32 %u2)
+
+ call void @use(i32 %d_u2)
+ call void @use(i32 %u1_u2)
+
+ %tmp = call i32 @llvm.umin.i32(i32 %d, i32 %u1)
+ %result = call i32 @llvm.umin.i32(i32 %tmp, i32 %u2)
+ call void @use(i32 %result)
+
+ ret void
+}
+
+; Test GEP with LHS=uniform, RHS=divergent
+define amdgpu_kernel void @gep_lhs_uniform_rhs_divergent(ptr %base, i64 %u_offset) {
+; CHECK-LABEL: define amdgpu_kernel void @gep_lhs_uniform_rhs_divergent(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[U_OFFSET:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_EXT:%.*]] = zext i32 [[D]] to i64
+; CHECK-NEXT: [[GEP_U:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[U_OFFSET]]
+; CHECK-NEXT: [[GEP_D:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[D_EXT]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_U]])
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_D]])
+; CHECK-NEXT: [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_U]], i64 [[D_EXT]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_RESULT]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_ext = zext i32 %d to i64
+
+ ; Create BOTH dominating GEPs so there's a choice
+ %gep_u = getelementptr i32, ptr %base, i64 %u_offset ; uniform index
+ %gep_d = getelementptr i32, ptr %base, i64 %d_ext ; divergent index
+
+ call void @use_ptr(ptr %gep_u)
+ call void @use_ptr(ptr %gep_d)
+
+ ; idx = u_offset + d_ext (LHS=uniform, RHS=divergent)
+ %idx = add i64 %u_offset, %d_ext
+ %gep_result = getelementptr i32, ptr %base, i64 %idx
+ call void @use_ptr(ptr %gep_result)
+
+ ret void
+}
+
+; Test GEP with LHS=divergent, RHS=uniform
+define amdgpu_kernel void @gep_lhs_divergent_rhs_uniform(ptr %base, i64 %u_offset) {
+; CHECK-LABEL: define amdgpu_kernel void @gep_lhs_divergent_rhs_uniform(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[U_OFFSET:%.*]]) {
+; CHECK-NEXT: [[D:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D_EXT:%.*]] = zext i32 [[D]] to i64
+; CHECK-NEXT: [[GEP_U:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[U_OFFSET]]
+; CHECK-NEXT: [[GEP_D:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[D_EXT]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_U]])
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_D]])
+; CHECK-NEXT: [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_D]], i64 [[U_OFFSET]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_RESULT]])
+; CHECK-NEXT: ret void
+;
+ %d = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_ext = zext i32 %d to i64
+
+ ; Create BOTH dominating GEPs so there's a choice
+ %gep_u = getelementptr i32, ptr %base, i64 %u_offset ; uniform index
+ %gep_d = getelementptr i32, ptr %base, i64 %d_ext ; divergent index
+
+ call void @use_ptr(ptr %gep_u)
+ call void @use_ptr(ptr %gep_d)
+
+ ; idx = d_ext + u_offset (LHS=divergent, RHS=uniform)
+ %idx = add i64 %d_ext, %u_offset
+ %gep_result = getelementptr i32, ptr %base, i64 %idx
+ call void @use_ptr(ptr %gep_result)
+
+ ret void
+}
+
+; Test GEP with both LHS and RHS uniform - no preference needed
+define amdgpu_kernel void @gep_both_uniform(ptr %base, i64 %u1, i64 %u2) {
+; CHECK-LABEL: define amdgpu_kernel void @gep_both_uniform(
+; CHECK-SAME: ptr [[BASE:%.*]], i64 [[U1:%.*]], i64 [[U2:%.*]]) {
+; CHECK-NEXT: [[GEP_U1:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[U1]]
+; CHECK-NEXT: [[GEP_U2:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[U2]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_U1]])
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_U2]])
+; CHECK-NEXT: [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_U1]], i64 [[U2]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_RESULT]])
+; CHECK-NEXT: ret void
+;
+ ; Create both dominating GEPs with uniform indices
+ %gep_u1 = getelementptr i32, ptr %base, i64 %u1
+ %gep_u2 = getelementptr i32, ptr %base, i64 %u2
+
+ call void @use_ptr(ptr %gep_u1)
+ call void @use_ptr(ptr %gep_u2)
+
+ ; idx = u1 + u2 (both uniform - no preference needed)
+ %idx = add i64 %u1, %u2
+ %gep_result = getelementptr i32, ptr %base, i64 %idx
+ call void @use_ptr(ptr %gep_result)
+
+ ret void
+}
+
+; Test GEP with both LHS and RHS divergent - no preference needed
+define amdgpu_kernel void @gep_both_divergent(ptr %base) {
+; CHECK-LABEL: define amdgpu_kernel void @gep_both_divergent(
+; CHECK-SAME: ptr [[BASE:%.*]]) {
+; CHECK-NEXT: [[D1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[D2:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[D1_EXT:%.*]] = zext i32 [[D1]] to i64
+; CHECK-NEXT: [[D2_EXT:%.*]] = zext i32 [[D2]] to i64
+; CHECK-NEXT: [[GEP_D1:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[D1_EXT]]
+; CHECK-NEXT: [[GEP_D2:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[D2_EXT]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_D1]])
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_D2]])
+; CHECK-NEXT: [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_D1]], i64 [[D2_EXT]]
+; CHECK-NEXT: call void @use_ptr(ptr [[GEP_RESULT]])
+; CHECK-NEXT: ret void
+;
+ %d1 = call i32 @llvm.amdgcn.workitem.id.x()
+ %d2 = call i32 @llvm.amdgcn.workitem.id.y()
+ %d1_ext = zext i32 %d1 to i64
+ %d2_ext = zext i32 %d2 to i64
+
+ ; Create both dominating GEPs with divergent indices
+ %gep_d1 = getelementptr i32, ptr %base, i64 %d1_ext
+ %gep_d2 = getelementptr i32, ptr %base, i64 %d2_ext
+
+ call void @use_ptr(ptr %gep_d1)
+ call void @use_ptr(ptr %gep_d2)
+
+ ; idx = d1_ext + d2_ext (both divergent - no preference needed)
+ %idx = add i64 %d1_ext, %d2_ext
+ %gep_result = getelementptr i32, ptr %base, i64 %idx
+ call void @use_ptr(ptr %gep_result)
+
+ ret void
+}
+
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare void @use_ptr(ptr)
>From bf65765d667a28b3b974a1840c9bbdf7c4c0a16f Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Tue, 13 Jan 2026 18:40:20 +0530
Subject: [PATCH 2/4] [NaryReassociate] Make uniformity-aware to prefer
grouping uniform values
---
.../llvm/Transforms/Scalar/NaryReassociate.h | 4 +-
.../lib/Transforms/Scalar/NaryReassociate.cpp | 99 ++++++++++++++++++-
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 9 +-
.../AMDGPU/nary-add-uniform.ll | 10 +-
4 files changed, 110 insertions(+), 12 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h b/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
index 417801d470800..d09d4eb79e366 100644
--- a/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
+++ b/llvm/include/llvm/Transforms/Scalar/NaryReassociate.h
@@ -81,6 +81,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/ValueHandle.h"
@@ -105,7 +106,7 @@ class NaryReassociatePass : public PassInfoMixin<NaryReassociatePass> {
// Glue for old PM.
bool runImpl(Function &F, AssumptionCache *AC_, DominatorTree *DT_,
ScalarEvolution *SE_, TargetLibraryInfo *TLI_,
- TargetTransformInfo *TTI_);
+ TargetTransformInfo *TTI_, UniformityInfo *UI_ = nullptr);
private:
// Runs only one iteration of the dominator-based algorithm. See the header
@@ -180,6 +181,7 @@ class NaryReassociatePass : public PassInfoMixin<NaryReassociatePass> {
ScalarEvolution *SE;
TargetLibraryInfo *TLI;
TargetTransformInfo *TTI;
+ UniformityInfo *UI;
// A lookup table quickly telling which instructions compute the given SCEV.
// Note that there can be multiple instructions at different locations
diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index b0a33710c25bc..72dad5db0e931 100644
--- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -104,6 +104,7 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -141,6 +142,7 @@ class NaryReassociateLegacyPass : public FunctionPass {
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<UniformityInfoWrapperPass>();
AU.setPreservesCFG();
}
@@ -159,6 +161,7 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(NaryReassociateLegacyPass, "nary-reassociate",
"Nary reassociation", false, false)
@@ -176,7 +179,11 @@ bool NaryReassociateLegacyPass::runOnFunction(Function &F) {
auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- return Impl.runImpl(F, AC, DT, SE, TLI, TTI);
+ // UniformityInfo is required on all targets, but on targets without branch
+ // divergence it does no work and reports everything as uniform.
+ auto *UI = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+
+ return Impl.runImpl(F, AC, DT, SE, TLI, TTI, UI);
}
PreservedAnalyses NaryReassociatePass::run(Function &F,
@@ -187,7 +194,11 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
- if (!runImpl(F, AC, DT, SE, TLI, TTI))
+ // UniformityInfo is required on all targets, but on targets without branch
+ // divergence it does no work and reports everything as uniform.
+ auto *UI = &AM.getResult<UniformityInfoAnalysis>(F);
+
+ if (!runImpl(F, AC, DT, SE, TLI, TTI, UI))
return PreservedAnalyses::all();
PreservedAnalyses PA;
@@ -199,12 +210,14 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
bool NaryReassociatePass::runImpl(Function &F, AssumptionCache *AC_,
DominatorTree *DT_, ScalarEvolution *SE_,
TargetLibraryInfo *TLI_,
- TargetTransformInfo *TTI_) {
+ TargetTransformInfo *TTI_,
+ UniformityInfo *UI_) {
AC = AC_;
DT = DT_;
SE = SE_;
TLI = TLI_;
TTI = TTI_;
+ UI = UI_;
DL = &F.getDataLayout();
bool Changed = false, ChangedInThisIteration;
@@ -379,6 +392,33 @@ NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1);
// IndexToSplit = LHS + RHS.
+ // tryReassociateGEPAtIndex(GEP, I, LHS, RHS, ...) looks for a dominating
+ // GEP with LHS as index, then creates: NewGEP = existingGEP + RHS * scale.
+ // So the RHS becomes the "remaining" index calculation.
+ //
+ // For uniformity: prefer the remaining calculation to be uniform, as it
+ // can then stay in scalar registers.
+ //
+ // Default order tries LHS first (RHS as remainder). If LHS is uniform and
+ // RHS is divergent, we want to try RHS first so uniform LHS becomes the
+ // remainder. The case where RHS is uniform and LHS is divergent is already
+ // handled by the default order.
+ if (UI && UI->isUniform(LHS) && !UI->isUniform(RHS)) {
+ LLVM_DEBUG(
+ dbgs() << "NARY: Preferring uniform remainder for GEP index\n");
+ // LHS is uniform, prefer it as remainder - try RHS first
+ if (LHS != RHS) {
+ if (auto *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+ return NewGEP;
+ }
+ if (auto *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+ return NewGEP;
+ return nullptr;
+ }
+
+ // Default order
if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
return NewGEP;
// Symmetrically, try IndexToSplit = RHS + LHS.
@@ -482,6 +522,38 @@ Instruction *NaryReassociatePass::tryReassociateBinaryOp(Value *LHS, Value *RHS,
// = (A op RHS) op B or (B op RHS) op A
SCEVUse AExpr = SE->getSCEV(A), BExpr = SE->getSCEV(B);
SCEVUse RHSExpr = SE->getSCEV(RHS);
+
+ // When uniformity analysis is available, prefer reassociations that group
+ // uniform values together. This can reduce register pressure on targets
+ // with divergent execution.
+ //
+ // For I = (A op B) op RHS, we can form:
+ // - (A op RHS) op B: groups A and RHS
+ // - (B op RHS) op A: groups B and RHS
+ //
+ // Prefer the grouping where both operands in the new sub-expression are
+ // uniform, keeping uniform computations grouped together.
+ //
+ // We only need to handle the case where B and RHS are uniform but A is
+ // divergent. The symmetric case (A and RHS uniform, B divergent) is already
+ // handled by the default order which tries (A op RHS) op B first.
+ if (UI && UI->isUniform(B) && UI->isUniform(RHS) && !UI->isUniform(A)) {
+ LLVM_DEBUG(dbgs() << "NARY: Preferring uniform grouping for " << *I
+ << "\n");
+ if (AExpr != RHSExpr) {
+ if (Instruction *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
+ return NewI;
+ }
+ if (BExpr != RHSExpr) {
+ if (Instruction *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
+ return NewI;
+ }
+ return nullptr;
+ }
+
+ // Default order: try (A op RHS) op B first
if (BExpr != RHSExpr) {
if (auto *NewI =
tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
@@ -649,6 +721,27 @@ Value *NaryReassociatePass::tryReassociateMinOrMax(Instruction *I,
SCEVUse BExpr = SE->getSCEV(B);
SCEVUse RHSExpr = SE->getSCEV(RHS);
+ // Similar to binary ops, prefer grouping uniform values together when
+ // uniformity analysis is available.
+ // For I = minmax(minmax(A, B), RHS), we can form:
+ // - minmax(minmax(A, RHS), B): groups A and RHS
+ // - minmax(minmax(B, RHS), A): groups B and RHS
+ if (UI && UI->isUniform(B) && UI->isUniform(RHS) && !UI->isUniform(A)) {
+ LLVM_DEBUG(dbgs() << "NARY: Preferring uniform grouping for minmax " << *I
+ << "\n");
+ // Try (B op RHS) op A first - groups uniform B with uniform RHS
+ if (AExpr != RHSExpr) {
+ if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
+ return NewMinMax;
+ }
+ if (BExpr != RHSExpr) {
+ if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
+ return NewMinMax;
+ }
+ return nullptr;
+ }
+
+ // Default order
if (BExpr != RHSExpr) {
// Try (A op RHS) op B
if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 81b9aae775ed8..3592c2d208947 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -516,9 +516,10 @@
; GCN-O1-OPTS-NEXT: Straight line strength reduction
; GCN-O1-OPTS-NEXT: Early CSE
; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis
+; GCN-O1-OPTS-NEXT: Cycle Info Analysis
+; GCN-O1-OPTS-NEXT: Uniformity Analysis
; GCN-O1-OPTS-NEXT: Nary reassociation
; GCN-O1-OPTS-NEXT: Early CSE
-; GCN-O1-OPTS-NEXT: Cycle Info Analysis
; GCN-O1-OPTS-NEXT: Uniformity Analysis
; GCN-O1-OPTS-NEXT: AMDGPU IR optimizations
; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl)
@@ -833,9 +834,10 @@
; GCN-O2-NEXT: Straight line strength reduction
; GCN-O2-NEXT: Early CSE
; GCN-O2-NEXT: Scalar Evolution Analysis
+; GCN-O2-NEXT: Cycle Info Analysis
+; GCN-O2-NEXT: Uniformity Analysis
; GCN-O2-NEXT: Nary reassociation
; GCN-O2-NEXT: Early CSE
-; GCN-O2-NEXT: Cycle Info Analysis
; GCN-O2-NEXT: Uniformity Analysis
; GCN-O2-NEXT: AMDGPU IR optimizations
; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
@@ -1166,9 +1168,10 @@
; GCN-O3-NEXT: Optimization Remark Emitter
; GCN-O3-NEXT: Global Value Numbering
; GCN-O3-NEXT: Scalar Evolution Analysis
+; GCN-O3-NEXT: Cycle Info Analysis
+; GCN-O3-NEXT: Uniformity Analysis
; GCN-O3-NEXT: Nary reassociation
; GCN-O3-NEXT: Early CSE
-; GCN-O3-NEXT: Cycle Info Analysis
; GCN-O3-NEXT: Uniformity Analysis
; GCN-O3-NEXT: AMDGPU IR optimizations
; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
diff --git a/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
index d8f5c6009aa5e..487a6d9270121 100644
--- a/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
+++ b/llvm/test/Transforms/NaryReassociate/AMDGPU/nary-add-uniform.ll
@@ -30,7 +30,7 @@ define amdgpu_kernel void @prefer_uniform_grouping(i32 %u1, i32 %u2) {
; CHECK-NEXT: [[U1_U2:%.*]] = add i32 [[U1]], [[U2]]
; CHECK-NEXT: call void @use(i32 [[D_U2]])
; CHECK-NEXT: call void @use(i32 [[U1_U2]])
-; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[D_U2]], [[U1]]
+; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[U1_U2]], [[D]]
; CHECK-NEXT: call void @use(i32 [[RESULT]])
; CHECK-NEXT: ret void
;
@@ -60,7 +60,7 @@ define amdgpu_kernel void @prefer_uniform_grouping_mul(i32 %u1, i32 %u2) {
; CHECK-NEXT: [[U1_U2:%.*]] = mul i32 [[U1]], [[U2]]
; CHECK-NEXT: call void @use(i32 [[D_U2]])
; CHECK-NEXT: call void @use(i32 [[U1_U2]])
-; CHECK-NEXT: [[RESULT:%.*]] = mul i32 [[D_U2]], [[U1]]
+; CHECK-NEXT: [[RESULT:%.*]] = mul i32 [[U1_U2]], [[D]]
; CHECK-NEXT: call void @use(i32 [[RESULT]])
; CHECK-NEXT: ret void
;
@@ -135,7 +135,7 @@ define amdgpu_kernel void @prefer_uniform_grouping_smax(i32 %u1, i32 %u2) {
; CHECK-NEXT: [[U1_U2:%.*]] = call i32 @llvm.smax.i32(i32 [[U1]], i32 [[U2]])
; CHECK-NEXT: call void @use(i32 [[D_U2]])
; CHECK-NEXT: call void @use(i32 [[U1_U2]])
-; CHECK-NEXT: [[RESULT_NARY:%.*]] = call i32 @llvm.smax.i32(i32 [[D_U2]], i32 [[U1]])
+; CHECK-NEXT: [[RESULT_NARY:%.*]] = call i32 @llvm.smax.i32(i32 [[U1_U2]], i32 [[D]])
; CHECK-NEXT: call void @use(i32 [[RESULT_NARY]])
; CHECK-NEXT: ret void
;
@@ -166,7 +166,7 @@ define amdgpu_kernel void @prefer_uniform_grouping_umin(i32 %u1, i32 %u2) {
; CHECK-NEXT: [[U1_U2:%.*]] = call i32 @llvm.umin.i32(i32 [[U1]], i32 [[U2]])
; CHECK-NEXT: call void @use(i32 [[D_U2]])
; CHECK-NEXT: call void @use(i32 [[U1_U2]])
-; CHECK-NEXT: [[RESULT_NARY:%.*]] = call i32 @llvm.umin.i32(i32 [[D_U2]], i32 [[U1]])
+; CHECK-NEXT: [[RESULT_NARY:%.*]] = call i32 @llvm.umin.i32(i32 [[U1_U2]], i32 [[D]])
; CHECK-NEXT: call void @use(i32 [[RESULT_NARY]])
; CHECK-NEXT: ret void
;
@@ -195,7 +195,7 @@ define amdgpu_kernel void @gep_lhs_uniform_rhs_divergent(ptr %base, i64 %u_offse
; CHECK-NEXT: [[GEP_D:%.*]] = getelementptr i32, ptr [[BASE]], i64 [[D_EXT]]
; CHECK-NEXT: call void @use_ptr(ptr [[GEP_U]])
; CHECK-NEXT: call void @use_ptr(ptr [[GEP_D]])
-; CHECK-NEXT: [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_U]], i64 [[D_EXT]]
+; CHECK-NEXT: [[GEP_RESULT:%.*]] = getelementptr i32, ptr [[GEP_D]], i64 [[U_OFFSET]]
; CHECK-NEXT: call void @use_ptr(ptr [[GEP_RESULT]])
; CHECK-NEXT: ret void
;
>From eac646711a17072bd2abffd75d4147e8ef21f5f3 Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Thu, 26 Feb 2026 18:40:31 +0530
Subject: [PATCH 3/4] review: address suggestions
---
.../lib/Transforms/Scalar/NaryReassociate.cpp | 48 ++++++++-----------
1 file changed, 20 insertions(+), 28 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index 72dad5db0e931..c239e09470140 100644
--- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -181,7 +181,8 @@ bool NaryReassociateLegacyPass::runOnFunction(Function &F) {
// UniformityInfo is required on all targets, but on targets without branch
// divergence it does no work and reports everything as uniform.
- auto *UI = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+ UniformityInfo *UI =
+ &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
return Impl.runImpl(F, AC, DT, SE, TLI, TTI, UI);
}
@@ -196,7 +197,7 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
// UniformityInfo is required on all targets, but on targets without branch
// divergence it does no work and reports everything as uniform.
- auto *UI = &AM.getResult<UniformityInfoAnalysis>(F);
+ UniformityInfo *UI = &AM.getResult<UniformityInfoAnalysis>(F);
if (!runImpl(F, AC, DT, SE, TLI, TTI, UI))
return PreservedAnalyses::all();
@@ -395,38 +396,29 @@ NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
// tryReassociateGEPAtIndex(GEP, I, LHS, RHS, ...) looks for a dominating
// GEP with LHS as index, then creates: NewGEP = existingGEP + RHS * scale.
// So the RHS becomes the "remaining" index calculation.
- //
- // For uniformity: prefer the remaining calculation to be uniform, as it
- // can then stay in scalar registers.
- //
+
+ // When LHS == RHS, both call orders are identical, so only try once.
+ if (LHS == RHS)
+ return tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType);
+
+ // When uniformity analysis is available, prefer the remaining calculation
+ // to be uniform, keeping uniform computations grouped together.
// Default order tries LHS first (RHS as remainder). If LHS is uniform and
- // RHS is divergent, we want to try RHS first so uniform LHS becomes the
- // remainder. The case where RHS is uniform and LHS is divergent is already
- // handled by the default order.
+ // RHS is divergent, try RHS first so uniform LHS becomes the remainder.
if (UI && UI->isUniform(LHS) && !UI->isUniform(RHS)) {
LLVM_DEBUG(
dbgs() << "NARY: Preferring uniform remainder for GEP index\n");
- // LHS is uniform, prefer it as remainder - try RHS first
- if (LHS != RHS) {
- if (auto *NewGEP =
- tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
- return NewGEP;
- }
- if (auto *NewGEP =
- tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+ if (GetElementPtrInst *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
return NewGEP;
- return nullptr;
+ return tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType);
}
- // Default order
- if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+ // Default order: try (LHS, RHS) first, then (RHS, LHS).
+ if (GetElementPtrInst *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
return NewGEP;
- // Symmetrically, try IndexToSplit = RHS + LHS.
- if (LHS != RHS) {
- if (auto *NewGEP =
- tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
- return NewGEP;
- }
+ return tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType);
}
return nullptr;
}
@@ -731,11 +723,11 @@ Value *NaryReassociatePass::tryReassociateMinOrMax(Instruction *I,
<< "\n");
// Try (B op RHS) op A first - groups uniform B with uniform RHS
if (AExpr != RHSExpr) {
- if (auto *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
+ if (Value *NewMinMax = tryCombination(RHS, RHSExpr, B, BExpr, A, AExpr))
return NewMinMax;
}
if (BExpr != RHSExpr) {
- if (auto *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
+ if (Value *NewMinMax = tryCombination(A, AExpr, RHS, RHSExpr, B, BExpr))
return NewMinMax;
}
return nullptr;
>From ca8645f14a6e0f7bf16e83721713686714b1306c Mon Sep 17 00:00:00 2001
From: padivedi <padivedi at amd.com>
Date: Wed, 1 Apr 2026 13:33:29 +0530
Subject: [PATCH 4/4] delay fetching uniformity for targets that don't need it
---
llvm/lib/Transforms/Scalar/NaryReassociate.cpp | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index c239e09470140..6e4927fe7a329 100644
--- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -179,10 +179,11 @@ bool NaryReassociateLegacyPass::runOnFunction(Function &F) {
auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- // UniformityInfo is required on all targets, but on targets without branch
- // divergence it does no work and reports everything as uniform.
- UniformityInfo *UI =
- &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+ // Only compute UniformityInfo on targets with branch divergence to avoid
+ // the compile-time cost of CycleAnalysis on targets that don't need it.
+ UniformityInfo *UI = nullptr;
+ if (TTI->hasBranchDivergence(&F))
+ UI = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
return Impl.runImpl(F, AC, DT, SE, TLI, TTI, UI);
}
@@ -195,9 +196,11 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
- // UniformityInfo is required on all targets, but on targets without branch
- // divergence it does no work and reports everything as uniform.
- UniformityInfo *UI = &AM.getResult<UniformityInfoAnalysis>(F);
+ // Only compute UniformityInfo on targets with branch divergence to avoid
+ // the compile-time cost of CycleAnalysis on targets that don't need it.
+ UniformityInfo *UI = nullptr;
+ if (TTI->hasBranchDivergence(&F))
+ UI = &AM.getResult<UniformityInfoAnalysis>(F);
if (!runImpl(F, AC, DT, SE, TLI, TTI, UI))
return PreservedAnalyses::all();
More information about the llvm-commits
mailing list