[llvm] [AMDGPU] Limit promoting allocas that have users with dynamic index above a threshold on number of elements (PR #170327)
Kevin Choi via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 3 11:14:45 PST 2025
https://github.com/choikwa updated https://github.com/llvm/llvm-project/pull/170327
>From 97f7c78f07b35264305b54d1ce400a83c6282e42 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Tue, 2 Dec 2025 09:51:10 -0600
Subject: [PATCH 1/6] [AMDGPU] Limit promoting allocas that have users with
dynamic index above a threshold on number of elements
The AMDGPU backend generates poor code (a scalarized copy) when extracting subvectors with a dynamic index, which can hurt compile time, register pressure, etc.
For vectors with a large number of elements (e.g. <128 x i8> with a <32 x i8> user), dynamic indexing will blow up compile time in GreedyRA.
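Added a check on the GEP: when its index is not a constant, it is used by a load, and its source element type is a fixed vector with more elements than the threshold (default 8), the user is rejected for promotion to vector.
Added test cases covering different numbers of elements in the subvector user.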
---
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 22 +++++
.../AMDGPU/promote-alloca-vector-gep.ll | 80 +++++++++++++++++++
2 files changed, 102 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index bb95265a794a0..aba660ffb6e45 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -85,6 +85,11 @@ static cl::opt<unsigned>
"when sorting profitable allocas"),
cl::init(4));
+static cl::opt<unsigned> DynIdxNumElmLimit("dynamic-index-num-element-limit",
+ cl::desc("Maximum number of elements for promoting alloca with dynamic"
+ " index"),
+ cl::init(8));
+
// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
private:
@@ -919,6 +924,23 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
if (!Index)
return RejectUser(Inst, "cannot compute vector index for GEP");
+
+ if (!isa<ConstantInt>(Index)) {
+ bool UsedInLoad = false;
+ for (auto *U : GEP->users()) {
+ if(isa<LoadInst>(U)) {
+ UsedInLoad = true;
+ break;
+ }
+ }
+ if (auto *UserVecTy = dyn_cast<FixedVectorType>(
+ GEP->getSourceElementType())) {
+ if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) {
+ return RejectUser(Inst,
+ "user has too many number of elements for dynamic index");
+ }
+ }
+ }
GEPVectorIdx[GEP] = Index;
UsersToRemove.push_back(Inst);
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
index 76e1868b3c4b9..caab29b58c13f 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
@@ -3,6 +3,8 @@
; Check that invalid IR is not produced on a vector typed
; getelementptr with a scalar alloca pointer base.
+; Also check that a GEP with a dynamic index is rejected when the
+; number of elements exceeds the threshold.
define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
@@ -250,6 +252,84 @@ bb2:
store i32 0, ptr addrspace(5) %extractelement
ret void
}
+
+define amdgpu_kernel void @GEP_dynamic_idx_v32i8(ptr addrspace(1) %out, i32 %idx) {
+; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v32i8(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
+; CHECK-NEXT: [[VEC:%.*]] = load <16 x i8>, ptr addrspace(5) [[GEP]], align 4
+; CHECK-NEXT: store <16 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %alloca = alloca [64 x i8], align 4, addrspace(5)
+ %gep = getelementptr inbounds <16 x i8>, ptr addrspace(5) %alloca, i32 %idx
+ %vec = load <16 x i8>, ptr addrspace(5) %gep, align 4
+ store <16 x i8> %vec, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @GEP_dynamic_idx_v8i8(ptr addrspace(1) %out, i32 %idx) {
+; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v8i8(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison
+; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 8
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[TMP4]], i64 1
+; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[TMP7]], i64 2
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[TMP10]], i64 3
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[TMP13]], i64 4
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[TMP16]], i64 5
+; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP19]], i64 6
+; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP22]], i64 7
+; CHECK-NEXT: store <8 x i8> [[TMP23]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %alloca = alloca [64 x i8], align 4, addrspace(5)
+ %gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx
+ %vec = load <8 x i8>, ptr addrspace(5) %gep, align 4
+ store <8 x i8> %vec, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @GEP_dynamic_idx_noload(ptr addrspace(1) %out, i32 %idx) {
+; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
+; CHECK-NEXT: [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64
+; CHECK-NEXT: store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %alloca = alloca [64 x i8], align 4, addrspace(5)
+ %gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx
+ %gepint = ptrtoint ptr addrspace(5) %gep to i64
+ store i64 %gepint, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+
;.
; CHECK: [[META0]] = !{}
; CHECK: [[RNG1]] = !{i32 0, i32 1025}
>From 406a57568f58eb0016f916cf028f11c51c9ace0a Mon Sep 17 00:00:00 2001
From: Kevin Choi <5455710+choikwa at users.noreply.github.com>
Date: Tue, 2 Dec 2025 12:07:35 -0500
Subject: [PATCH 2/6] Update llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Co-authored-by: Copilot <175728472+Copilot at users.noreply.github.com>
---
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index aba660ffb6e45..783a9408e249f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -937,7 +937,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
GEP->getSourceElementType())) {
if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) {
return RejectUser(Inst,
- "user has too many number of elements for dynamic index");
+ "user has too many elements for dynamic index");
}
}
}
>From a311a654a85840de44e4a5a8d42cf65c36ed4045 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Tue, 2 Dec 2025 16:57:33 -0600
Subject: [PATCH 3/6] NFC, formatting
---
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 783a9408e249f..1282641bd3325 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -85,9 +85,10 @@ static cl::opt<unsigned>
"when sorting profitable allocas"),
cl::init(4));
-static cl::opt<unsigned> DynIdxNumElmLimit("dynamic-index-num-element-limit",
+static cl::opt<unsigned> DynIdxNumElmLimit(
+ "dynamic-index-num-element-limit",
cl::desc("Maximum number of elements for promoting alloca with dynamic"
- " index"),
+ " index"),
cl::init(8));
// Shared implementation which can do both promotion to vector and to LDS.
@@ -924,20 +925,20 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
if (!Index)
return RejectUser(Inst, "cannot compute vector index for GEP");
-
+
if (!isa<ConstantInt>(Index)) {
bool UsedInLoad = false;
for (auto *U : GEP->users()) {
- if(isa<LoadInst>(U)) {
+ if (isa<LoadInst>(U)) {
UsedInLoad = true;
break;
}
}
- if (auto *UserVecTy = dyn_cast<FixedVectorType>(
- GEP->getSourceElementType())) {
+ if (auto *UserVecTy =
+ dyn_cast<FixedVectorType>(GEP->getSourceElementType())) {
if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) {
- return RejectUser(Inst,
- "user has too many elements for dynamic index");
+ return RejectUser(Inst,
+ "user has too many elements for dynamic index");
}
}
}
>From 80fb5a41eb4d5f0a2dbba5a5d51e192492d6e6da Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 3 Dec 2025 12:38:44 -0600
Subject: [PATCH 4/6] addressing feedback
---
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 11 +++--------
1 file changed, 3 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 1282641bd3325..e618b88253457 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -86,7 +86,7 @@ static cl::opt<unsigned>
cl::init(4));
static cl::opt<unsigned> DynIdxNumElmLimit(
- "dynamic-index-num-element-limit",
+ "amdgpu-dynamic-index-num-element-limit",
cl::desc("Maximum number of elements for promoting alloca with dynamic"
" index"),
cl::init(8));
@@ -927,13 +927,8 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
return RejectUser(Inst, "cannot compute vector index for GEP");
if (!isa<ConstantInt>(Index)) {
- bool UsedInLoad = false;
- for (auto *U : GEP->users()) {
- if (isa<LoadInst>(U)) {
- UsedInLoad = true;
- break;
- }
- }
+ bool UsedInLoad = llvm::any_of(GEP->users(),
+ [&](const auto *U){ return isa<LoadInst>(U); });
if (auto *UserVecTy =
dyn_cast<FixedVectorType>(GEP->getSourceElementType())) {
if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) {
>From 671de2f770da59846de5908dfd5648cc91c7e92b Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 3 Dec 2025 13:07:11 -0600
Subject: [PATCH 5/6] nfc, rename var
---
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index e618b88253457..ec4abe281e7fd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -85,7 +85,7 @@ static cl::opt<unsigned>
"when sorting profitable allocas"),
cl::init(4));
-static cl::opt<unsigned> DynIdxNumElmLimit(
+static cl::opt<unsigned> DynamicIndexNumberElementLimit(
"amdgpu-dynamic-index-num-element-limit",
cl::desc("Maximum number of elements for promoting alloca with dynamic"
" index"),
@@ -931,7 +931,8 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
[&](const auto *U){ return isa<LoadInst>(U); });
if (auto *UserVecTy =
dyn_cast<FixedVectorType>(GEP->getSourceElementType())) {
- if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) {
+ if (UsedInLoad &&
+ UserVecTy->getNumElements() > DynamicIndexNumberElementLimit) {
return RejectUser(Inst,
"user has too many elements for dynamic index");
}
>From 3b7249961aded412120eec78a8fde151024751e2 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 3 Dec 2025 13:14:17 -0600
Subject: [PATCH 6/6] nfc, formatting
---
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index ec4abe281e7fd..912bd799db6c3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -927,8 +927,8 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
return RejectUser(Inst, "cannot compute vector index for GEP");
if (!isa<ConstantInt>(Index)) {
- bool UsedInLoad = llvm::any_of(GEP->users(),
- [&](const auto *U){ return isa<LoadInst>(U); });
+ bool UsedInLoad = llvm::any_of(
+ GEP->users(), [&](const auto *U){ return isa<LoadInst>(U); });
if (auto *UserVecTy =
dyn_cast<FixedVectorType>(GEP->getSourceElementType())) {
if (UsedInLoad &&
More information about the llvm-commits
mailing list