[llvm] [AMDGPU] Limit promoting allocas that have users with dynamic index above a threshold on number of elements (PR #170327)

Wed Dec 3 11:14:45 PST 2025

https://github.com/choikwa updated https://github.com/llvm/llvm-project/pull/170327

>From 97f7c78f07b35264305b54d1ce400a83c6282e42 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Tue, 2 Dec 2025 09:51:10 -0600
Subject: [PATCH 1/6] [AMDGPU] Limit promoting allocas that have users with
 dynamic index above a threshold on number of elements

The AMDGPU backend generates poor code (a scalarized copy) when extracting subvectors with a dynamic index, which can hurt compile time, register pressure, etc.
For vectors with a large number of elements (e.g. <128 x i8> with a <32 x i8> user), dynamic indexing blows up compile time in GreedyRA.

Added a check in the GEP handling to see whether the GEP is used by a load.
Added test cases covering different numbers of elements in the subvector user.
---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 22 +++++
 .../AMDGPU/promote-alloca-vector-gep.ll       | 80 +++++++++++++++++++
 2 files changed, 102 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index bb95265a794a0..aba660ffb6e45 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -85,6 +85,11 @@ static cl::opt<unsigned>
                             "when sorting profitable allocas"),
                    cl::init(4));
 
+static cl::opt<unsigned> DynIdxNumElmLimit("dynamic-index-num-element-limit",
+    cl::desc("Maximum number of elements for promoting alloca with dynamic"
+      " index"),
+    cl::init(8));
+
 // Shared implementation which can do both promotion to vector and to LDS.
 class AMDGPUPromoteAllocaImpl {
 private:
@@ -919,6 +924,23 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
       if (!Index)
         return RejectUser(Inst, "cannot compute vector index for GEP");
+      
+      if (!isa<ConstantInt>(Index)) {
+        bool UsedInLoad = false;
+        for (auto *U : GEP->users()) {
+          if(isa<LoadInst>(U)) {
+            UsedInLoad = true;
+            break;
+          }
+        }
+        if (auto *UserVecTy = dyn_cast<FixedVectorType>(
+                GEP->getSourceElementType())) {
+          if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) {
+            return RejectUser(Inst, 
+              "user has too many number of elements for dynamic index");
+          }
+        }
+      }
 
       GEPVectorIdx[GEP] = Index;
       UsersToRemove.push_back(Inst);
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
index 76e1868b3c4b9..caab29b58c13f 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
@@ -3,6 +3,8 @@
 
 ; Check that invalid IR is not produced on a vector typed
 ; getelementptr with a scalar alloca pointer base.
+; Also check that a GEP with a dynamic index is rejected when its
+; source vector type has more elements than the threshold.
 
 define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
 ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
@@ -250,6 +252,84 @@ bb2:
   store i32 0, ptr addrspace(5) %extractelement
   ret void
 }
+
+define amdgpu_kernel void @GEP_dynamic_idx_v32i8(ptr addrspace(1) %out, i32 %idx) {
+; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v32i8(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
+; CHECK-NEXT:    [[VEC:%.*]] = load <16 x i8>, ptr addrspace(5) [[GEP]], align 4
+; CHECK-NEXT:    store <16 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %alloca = alloca [64 x i8], align 4, addrspace(5)
+  %gep = getelementptr inbounds <16 x i8>, ptr addrspace(5) %alloca, i32 %idx
+  %vec = load <16 x i8>, ptr addrspace(5) %gep, align 4
+  store <16 x i8> %vec, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @GEP_dynamic_idx_v8i8(ptr addrspace(1) %out, i32 %idx) {
+; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v8i8(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <64 x i8> poison
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i32 [[IDX]], 8
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[TMP4]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[TMP7]], i64 2
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[TMP10]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[TMP13]], i64 4
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP0]], 5
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[TMP16]], i64 5
+; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP0]], 6
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP19]], i64 6
+; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP0]], 7
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP22]], i64 7
+; CHECK-NEXT:    store <8 x i8> [[TMP23]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %alloca = alloca [64 x i8], align 4, addrspace(5)
+  %gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx
+  %vec = load <8 x i8>, ptr addrspace(5) %gep, align 4
+  store <8 x i8> %vec, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @GEP_dynamic_idx_noload(ptr addrspace(1) %out, i32 %idx) {
+; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
+; CHECK-NEXT:    [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64
+; CHECK-NEXT:    store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %alloca = alloca [64 x i8], align 4, addrspace(5)
+  %gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx
+  %gepint = ptrtoint ptr addrspace(5) %gep to i64
+  store i64 %gepint, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+
 ;.
 ; CHECK: [[META0]] = !{}
 ; CHECK: [[RNG1]] = !{i32 0, i32 1025}

>From 406a57568f58eb0016f916cf028f11c51c9ace0a Mon Sep 17 00:00:00 2001
From: Kevin Choi <5455710+choikwa at users.noreply.github.com>
Date: Tue, 2 Dec 2025 12:07:35 -0500
Subject: [PATCH 2/6] Update llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Co-authored-by: Copilot <175728472+Copilot at users.noreply.github.com>
---
 llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index aba660ffb6e45..783a9408e249f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -937,7 +937,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
                 GEP->getSourceElementType())) {
           if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) {
             return RejectUser(Inst, 
-              "user has too many number of elements for dynamic index");
+              "user has too many elements for dynamic index");
           }
         }
       }

>From a311a654a85840de44e4a5a8d42cf65c36ed4045 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Tue, 2 Dec 2025 16:57:33 -0600
Subject: [PATCH 3/6] NFC, formatting

---
 llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 783a9408e249f..1282641bd3325 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -85,9 +85,10 @@ static cl::opt<unsigned>
                             "when sorting profitable allocas"),
                    cl::init(4));
 
-static cl::opt<unsigned> DynIdxNumElmLimit("dynamic-index-num-element-limit",
+static cl::opt<unsigned> DynIdxNumElmLimit(
+    "dynamic-index-num-element-limit",
     cl::desc("Maximum number of elements for promoting alloca with dynamic"
-      " index"),
+             " index"),
     cl::init(8));
 
 // Shared implementation which can do both promotion to vector and to LDS.
@@ -924,20 +925,20 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
       if (!Index)
         return RejectUser(Inst, "cannot compute vector index for GEP");
-      
+
       if (!isa<ConstantInt>(Index)) {
         bool UsedInLoad = false;
         for (auto *U : GEP->users()) {
-          if(isa<LoadInst>(U)) {
+          if (isa<LoadInst>(U)) {
             UsedInLoad = true;
             break;
           }
         }
-        if (auto *UserVecTy = dyn_cast<FixedVectorType>(
-                GEP->getSourceElementType())) {
+        if (auto *UserVecTy =
+                dyn_cast<FixedVectorType>(GEP->getSourceElementType())) {
           if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) {
-            return RejectUser(Inst, 
-              "user has too many elements for dynamic index");
+            return RejectUser(Inst,
+                              "user has too many elements for dynamic index");
           }
         }
       }

>From 80fb5a41eb4d5f0a2dbba5a5d51e192492d6e6da Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 3 Dec 2025 12:38:44 -0600
Subject: [PATCH 4/6] addressing feedback

---
 llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 1282641bd3325..e618b88253457 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -86,7 +86,7 @@ static cl::opt<unsigned>
                    cl::init(4));
 
 static cl::opt<unsigned> DynIdxNumElmLimit(
-    "dynamic-index-num-element-limit",
+    "amdgpu-dynamic-index-num-element-limit",
     cl::desc("Maximum number of elements for promoting alloca with dynamic"
              " index"),
     cl::init(8));
@@ -927,13 +927,8 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
         return RejectUser(Inst, "cannot compute vector index for GEP");
 
       if (!isa<ConstantInt>(Index)) {
-        bool UsedInLoad = false;
-        for (auto *U : GEP->users()) {
-          if (isa<LoadInst>(U)) {
-            UsedInLoad = true;
-            break;
-          }
-        }
+        bool UsedInLoad = llvm::any_of(GEP->users(), 
+            [&](const auto *U){ return isa<LoadInst>(U); });
         if (auto *UserVecTy =
                 dyn_cast<FixedVectorType>(GEP->getSourceElementType())) {
           if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) {

>From 671de2f770da59846de5908dfd5648cc91c7e92b Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 3 Dec 2025 13:07:11 -0600
Subject: [PATCH 5/6] nfc, rename var

---
 llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index e618b88253457..ec4abe281e7fd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -85,7 +85,7 @@ static cl::opt<unsigned>
                             "when sorting profitable allocas"),
                    cl::init(4));
 
-static cl::opt<unsigned> DynIdxNumElmLimit(
+static cl::opt<unsigned> DynamicIndexNumberElementLimit(
     "amdgpu-dynamic-index-num-element-limit",
     cl::desc("Maximum number of elements for promoting alloca with dynamic"
              " index"),
@@ -931,7 +931,8 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
             [&](const auto *U){ return isa<LoadInst>(U); });
         if (auto *UserVecTy =
                 dyn_cast<FixedVectorType>(GEP->getSourceElementType())) {
-          if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) {
+          if (UsedInLoad &&
+              UserVecTy->getNumElements() > DynamicIndexNumberElementLimit) {
             return RejectUser(Inst,
                               "user has too many elements for dynamic index");
           }

>From 3b7249961aded412120eec78a8fde151024751e2 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 3 Dec 2025 13:14:17 -0600
Subject: [PATCH 6/6] nfc, formatting

---
 llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index ec4abe281e7fd..912bd799db6c3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -927,8 +927,8 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
         return RejectUser(Inst, "cannot compute vector index for GEP");
 
       if (!isa<ConstantInt>(Index)) {
-        bool UsedInLoad = llvm::any_of(GEP->users(), 
-            [&](const auto *U){ return isa<LoadInst>(U); });
+        bool UsedInLoad = llvm::any_of(
+            GEP->users(), [&](const auto *U){ return isa<LoadInst>(U); });
         if (auto *UserVecTy =
                 dyn_cast<FixedVectorType>(GEP->getSourceElementType())) {
           if (UsedInLoad &&