[llvm] [AMDGPU] Limit promoting allocas that have users with dynamic index above a threshold on number of elements (PR #170327)
Kevin Choi via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 3 12:17:35 PST 2025
https://github.com/choikwa updated https://github.com/llvm/llvm-project/pull/170327
>From 97f7c78f07b35264305b54d1ce400a83c6282e42 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Tue, 2 Dec 2025 09:51:10 -0600
Subject: [PATCH 01/10] [AMDGPU] Limit promoting allocas that have users with
dynamic index above a threshold on number of elements
The AMDGPU backend generates poor code (a scalarized copy) when extracting subvectors with a dynamic index, which can hurt compile time, register pressure, etc.
For vectors with a large number of elements (e.g. <128 x i8> with a <32 x i8> user), dynamic indexing will blow up compile time in GreedyRA (the greedy register allocator).
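Added a check in the GEP handling: when the computed vector index is not a constant, the GEP is used by a load, and the GEP's source element type is a fixed vector with more elements than a new command-line limit (default 8), the user is rejected and the alloca is not promoted to a vector.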
Added test cases covering different numbers of elements in the subvector user.
---
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 22 +++++
.../AMDGPU/promote-alloca-vector-gep.ll | 80 +++++++++++++++++++
2 files changed, 102 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index bb95265a794a0..aba660ffb6e45 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -85,6 +85,11 @@ static cl::opt<unsigned>
"when sorting profitable allocas"),
cl::init(4));
+static cl::opt<unsigned> DynIdxNumElmLimit("dynamic-index-num-element-limit",
+ cl::desc("Maximum number of elements for promoting alloca with dynamic"
+ " index"),
+ cl::init(8));
+
// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
private:
@@ -919,6 +924,23 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
if (!Index)
return RejectUser(Inst, "cannot compute vector index for GEP");
+
+ if (!isa<ConstantInt>(Index)) {
+ bool UsedInLoad = false;
+ for (auto *U : GEP->users()) {
+ if(isa<LoadInst>(U)) {
+ UsedInLoad = true;
+ break;
+ }
+ }
+ if (auto *UserVecTy = dyn_cast<FixedVectorType>(
+ GEP->getSourceElementType())) {
+ if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) {
+ return RejectUser(Inst,
+ "user has too many number of elements for dynamic index");
+ }
+ }
+ }
GEPVectorIdx[GEP] = Index;
UsersToRemove.push_back(Inst);
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
index 76e1868b3c4b9..caab29b58c13f 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
@@ -3,6 +3,8 @@
; Check that invalid IR is not produced on a vector typed
; getelementptr with a scalar alloca pointer base.
+; Also check if GEP with dynamic index is rejected above
+; threshold # of elements.
define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
@@ -250,6 +252,84 @@ bb2:
store i32 0, ptr addrspace(5) %extractelement
ret void
}
+
+define amdgpu_kernel void @GEP_dynamic_idx_v32i8(ptr addrspace(1) %out, i32 %idx) {
+; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v32i8(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
+; CHECK-NEXT: [[VEC:%.*]] = load <16 x i8>, ptr addrspace(5) [[GEP]], align 4
+; CHECK-NEXT: store <16 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %alloca = alloca [64 x i8], align 4, addrspace(5)
+ %gep = getelementptr inbounds <16 x i8>, ptr addrspace(5) %alloca, i32 %idx
+ %vec = load <16 x i8>, ptr addrspace(5) %gep, align 4
+ store <16 x i8> %vec, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @GEP_dynamic_idx_v8i8(ptr addrspace(1) %out, i32 %idx) {
+; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v8i8(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison
+; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 8
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[TMP4]], i64 1
+; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[TMP7]], i64 2
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[TMP10]], i64 3
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[TMP13]], i64 4
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[TMP16]], i64 5
+; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP19]], i64 6
+; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP22]], i64 7
+; CHECK-NEXT: store <8 x i8> [[TMP23]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %alloca = alloca [64 x i8], align 4, addrspace(5)
+ %gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx
+ %vec = load <8 x i8>, ptr addrspace(5) %gep, align 4
+ store <8 x i8> %vec, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @GEP_dynamic_idx_noload(ptr addrspace(1) %out, i32 %idx) {
+; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
+; CHECK-NEXT: [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64
+; CHECK-NEXT: store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %alloca = alloca [64 x i8], align 4, addrspace(5)
+ %gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx
+ %gepint = ptrtoint ptr addrspace(5) %gep to i64
+ store i64 %gepint, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+
;.
; CHECK: [[META0]] = !{}
; CHECK: [[RNG1]] = !{i32 0, i32 1025}
>From 406a57568f58eb0016f916cf028f11c51c9ace0a Mon Sep 17 00:00:00 2001
From: Kevin Choi <5455710+choikwa at users.noreply.github.com>
Date: Tue, 2 Dec 2025 12:07:35 -0500
Subject: [PATCH 02/10] Update llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Co-authored-by: Copilot <175728472+Copilot at users.noreply.github.com>
---
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index aba660ffb6e45..783a9408e249f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -937,7 +937,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
GEP->getSourceElementType())) {
if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) {
return RejectUser(Inst,
- "user has too many number of elements for dynamic index");
+ "user has too many elements for dynamic index");
}
}
}
>From a311a654a85840de44e4a5a8d42cf65c36ed4045 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Tue, 2 Dec 2025 16:57:33 -0600
Subject: [PATCH 03/10] NFC, formatting
---
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 783a9408e249f..1282641bd3325 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -85,9 +85,10 @@ static cl::opt<unsigned>
"when sorting profitable allocas"),
cl::init(4));
-static cl::opt<unsigned> DynIdxNumElmLimit("dynamic-index-num-element-limit",
+static cl::opt<unsigned> DynIdxNumElmLimit(
+ "dynamic-index-num-element-limit",
cl::desc("Maximum number of elements for promoting alloca with dynamic"
- " index"),
+ " index"),
cl::init(8));
// Shared implementation which can do both promotion to vector and to LDS.
@@ -924,20 +925,20 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
if (!Index)
return RejectUser(Inst, "cannot compute vector index for GEP");
-
+
if (!isa<ConstantInt>(Index)) {
bool UsedInLoad = false;
for (auto *U : GEP->users()) {
- if(isa<LoadInst>(U)) {
+ if (isa<LoadInst>(U)) {
UsedInLoad = true;
break;
}
}
- if (auto *UserVecTy = dyn_cast<FixedVectorType>(
- GEP->getSourceElementType())) {
+ if (auto *UserVecTy =
+ dyn_cast<FixedVectorType>(GEP->getSourceElementType())) {
if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) {
- return RejectUser(Inst,
- "user has too many elements for dynamic index");
+ return RejectUser(Inst,
+ "user has too many elements for dynamic index");
}
}
}
>From 80fb5a41eb4d5f0a2dbba5a5d51e192492d6e6da Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 3 Dec 2025 12:38:44 -0600
Subject: [PATCH 04/10] addressing feedback
---
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 11 +++--------
1 file changed, 3 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 1282641bd3325..e618b88253457 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -86,7 +86,7 @@ static cl::opt<unsigned>
cl::init(4));
static cl::opt<unsigned> DynIdxNumElmLimit(
- "dynamic-index-num-element-limit",
+ "amdgpu-dynamic-index-num-element-limit",
cl::desc("Maximum number of elements for promoting alloca with dynamic"
" index"),
cl::init(8));
@@ -927,13 +927,8 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
return RejectUser(Inst, "cannot compute vector index for GEP");
if (!isa<ConstantInt>(Index)) {
- bool UsedInLoad = false;
- for (auto *U : GEP->users()) {
- if (isa<LoadInst>(U)) {
- UsedInLoad = true;
- break;
- }
- }
+ bool UsedInLoad = llvm::any_of(GEP->users(),
+ [&](const auto *U){ return isa<LoadInst>(U); });
if (auto *UserVecTy =
dyn_cast<FixedVectorType>(GEP->getSourceElementType())) {
if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) {
>From 671de2f770da59846de5908dfd5648cc91c7e92b Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 3 Dec 2025 13:07:11 -0600
Subject: [PATCH 05/10] nfc, rename var
---
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index e618b88253457..ec4abe281e7fd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -85,7 +85,7 @@ static cl::opt<unsigned>
"when sorting profitable allocas"),
cl::init(4));
-static cl::opt<unsigned> DynIdxNumElmLimit(
+static cl::opt<unsigned> DynamicIndexNumberElementLimit(
"amdgpu-dynamic-index-num-element-limit",
cl::desc("Maximum number of elements for promoting alloca with dynamic"
" index"),
@@ -931,7 +931,8 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
[&](const auto *U){ return isa<LoadInst>(U); });
if (auto *UserVecTy =
dyn_cast<FixedVectorType>(GEP->getSourceElementType())) {
- if (UsedInLoad && UserVecTy->getNumElements() > DynIdxNumElmLimit) {
+ if (UsedInLoad &&
+ UserVecTy->getNumElements() > DynamicIndexNumberElementLimit) {
return RejectUser(Inst,
"user has too many elements for dynamic index");
}
>From 3b7249961aded412120eec78a8fde151024751e2 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 3 Dec 2025 13:14:17 -0600
Subject: [PATCH 06/10] nfc, formatting
---
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index ec4abe281e7fd..912bd799db6c3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -927,8 +927,8 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
return RejectUser(Inst, "cannot compute vector index for GEP");
if (!isa<ConstantInt>(Index)) {
- bool UsedInLoad = llvm::any_of(GEP->users(),
- [&](const auto *U){ return isa<LoadInst>(U); });
+ bool UsedInLoad = llvm::any_of(
+ GEP->users(), [&](const auto *U){ return isa<LoadInst>(U); });
if (auto *UserVecTy =
dyn_cast<FixedVectorType>(GEP->getSourceElementType())) {
if (UsedInLoad &&
>From 8e31b8549b7f2651780ecdba05f022cfcd844b3b Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 3 Dec 2025 13:21:58 -0600
Subject: [PATCH 07/10] space
---
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 912bd799db6c3..38f4e07eb613f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -928,7 +928,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
if (!isa<ConstantInt>(Index)) {
bool UsedInLoad = llvm::any_of(
- GEP->users(), [&](const auto *U){ return isa<LoadInst>(U); });
+ GEP->users(), [&](const auto *U) { return isa<LoadInst>(U); });
if (auto *UserVecTy =
dyn_cast<FixedVectorType>(GEP->getSourceElementType())) {
if (UsedInLoad &&
>From 904e5e019525e3948ada81f503e6074a41a91934 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 3 Dec 2025 13:54:35 -0600
Subject: [PATCH 08/10] addressing feedback, move tests to
promote-alloca-vector-dynamic-idx.ll, test different limit values
---
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 6 +-
.../promote-alloca-vector-dynamic-idx.ll | 533 ++++++++++++++++++
.../AMDGPU/promote-alloca-vector-gep.ll | 78 ---
3 files changed, 536 insertions(+), 81 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 38f4e07eb613f..3b656480770b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -85,8 +85,8 @@ static cl::opt<unsigned>
"when sorting profitable allocas"),
cl::init(4));
-static cl::opt<unsigned> DynamicIndexNumberElementLimit(
- "amdgpu-dynamic-index-num-element-limit",
+static cl::opt<unsigned> PromoteAllocaDynamicIndexNumberElementLimit(
+ "amdgpu-promote-alloca-dynamic-index-num-element-limit",
cl::desc("Maximum number of elements for promoting alloca with dynamic"
" index"),
cl::init(8));
@@ -932,7 +932,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
if (auto *UserVecTy =
dyn_cast<FixedVectorType>(GEP->getSourceElementType())) {
if (UsedInLoad &&
- UserVecTy->getNumElements() > DynamicIndexNumberElementLimit) {
+ UserVecTy->getNumElements() > PromoteAllocaDynamicIndexNumberElementLimit) {
return RejectUser(Inst,
"user has too many elements for dynamic index");
}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx.ll
new file mode 100644
index 0000000000000..111f6e8f8d990
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-dynamic-idx.ll
@@ -0,0 +1,533 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -amdgpu-promote-alloca-dynamic-index-num-element-limit=4 < %s | FileCheck -check-prefix=C4 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=C8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -amdgpu-promote-alloca-dynamic-index-num-element-limit=16 < %s | FileCheck -check-prefix=C16 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -amdgpu-promote-alloca-dynamic-index-num-element-limit=32 < %s | FileCheck -check-prefix=C32 %s
+
+; Check if alloca is promoted if user is accessed with dynamic index
+
+define amdgpu_kernel void @GEP_dynamic_idx_v4i8(ptr addrspace(1) %out, i32 %idx) {
+; C4-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v4i8(
+; C4-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C4-NEXT: [[ENTRY:.*:]]
+; C4-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison
+; C4-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 4
+; C4-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]]
+; C4-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+; C4-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1
+; C4-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]]
+; C4-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[TMP4]], i64 1
+; C4-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2
+; C4-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]]
+; C4-NEXT: [[TMP8:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[TMP7]], i64 2
+; C4-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3
+; C4-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]]
+; C4-NEXT: [[TMP11:%.*]] = insertelement <4 x i8> [[TMP8]], i8 [[TMP10]], i64 3
+; C4-NEXT: store <4 x i8> [[TMP11]], ptr addrspace(1) [[OUT]], align 4
+; C4-NEXT: ret void
+;
+; C8-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v4i8(
+; C8-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C8-NEXT: [[ENTRY:.*:]]
+; C8-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison
+; C8-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 4
+; C8-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]]
+; C8-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+; C8-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1
+; C8-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]]
+; C8-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[TMP4]], i64 1
+; C8-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2
+; C8-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]]
+; C8-NEXT: [[TMP8:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[TMP7]], i64 2
+; C8-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3
+; C8-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]]
+; C8-NEXT: [[TMP11:%.*]] = insertelement <4 x i8> [[TMP8]], i8 [[TMP10]], i64 3
+; C8-NEXT: store <4 x i8> [[TMP11]], ptr addrspace(1) [[OUT]], align 4
+; C8-NEXT: ret void
+;
+; C16-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v4i8(
+; C16-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C16-NEXT: [[ENTRY:.*:]]
+; C16-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison
+; C16-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 4
+; C16-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]]
+; C16-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+; C16-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1
+; C16-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]]
+; C16-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[TMP4]], i64 1
+; C16-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2
+; C16-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]]
+; C16-NEXT: [[TMP8:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[TMP7]], i64 2
+; C16-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3
+; C16-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]]
+; C16-NEXT: [[TMP11:%.*]] = insertelement <4 x i8> [[TMP8]], i8 [[TMP10]], i64 3
+; C16-NEXT: store <4 x i8> [[TMP11]], ptr addrspace(1) [[OUT]], align 4
+; C16-NEXT: ret void
+;
+; C32-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v4i8(
+; C32-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C32-NEXT: [[ENTRY:.*:]]
+; C32-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison
+; C32-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 4
+; C32-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]]
+; C32-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i64 0
+; C32-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1
+; C32-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]]
+; C32-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[TMP4]], i64 1
+; C32-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2
+; C32-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]]
+; C32-NEXT: [[TMP8:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[TMP7]], i64 2
+; C32-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3
+; C32-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]]
+; C32-NEXT: [[TMP11:%.*]] = insertelement <4 x i8> [[TMP8]], i8 [[TMP10]], i64 3
+; C32-NEXT: store <4 x i8> [[TMP11]], ptr addrspace(1) [[OUT]], align 4
+; C32-NEXT: ret void
+;
+entry:
+ %alloca = alloca [64 x i8], align 4, addrspace(5)
+ %gep = getelementptr inbounds <4 x i8>, ptr addrspace(5) %alloca, i32 %idx
+ %vec = load <4 x i8>, ptr addrspace(5) %gep, align 4
+ store <4 x i8> %vec, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @GEP_dynamic_idx_v8i8(ptr addrspace(1) %out, i32 %idx) {
+; C4-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v8i8(
+; C4-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C4-NEXT: [[ENTRY:.*:]]
+; C4-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
+; C4-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
+; C4-NEXT: [[VEC:%.*]] = load <8 x i8>, ptr addrspace(5) [[GEP]], align 4
+; C4-NEXT: store <8 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4
+; C4-NEXT: ret void
+;
+; C8-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v8i8(
+; C8-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C8-NEXT: [[ENTRY:.*:]]
+; C8-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison
+; C8-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 8
+; C8-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]]
+; C8-NEXT: [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+; C8-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1
+; C8-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]]
+; C8-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[TMP4]], i64 1
+; C8-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2
+; C8-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]]
+; C8-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[TMP7]], i64 2
+; C8-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3
+; C8-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]]
+; C8-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[TMP10]], i64 3
+; C8-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4
+; C8-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]]
+; C8-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[TMP13]], i64 4
+; C8-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5
+; C8-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]]
+; C8-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[TMP16]], i64 5
+; C8-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6
+; C8-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]]
+; C8-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP19]], i64 6
+; C8-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7
+; C8-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]]
+; C8-NEXT: [[TMP23:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP22]], i64 7
+; C8-NEXT: store <8 x i8> [[TMP23]], ptr addrspace(1) [[OUT]], align 4
+; C8-NEXT: ret void
+;
+; C16-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v8i8(
+; C16-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C16-NEXT: [[ENTRY:.*:]]
+; C16-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison
+; C16-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 8
+; C16-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]]
+; C16-NEXT: [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+; C16-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1
+; C16-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]]
+; C16-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[TMP4]], i64 1
+; C16-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2
+; C16-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]]
+; C16-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[TMP7]], i64 2
+; C16-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3
+; C16-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]]
+; C16-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[TMP10]], i64 3
+; C16-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4
+; C16-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]]
+; C16-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[TMP13]], i64 4
+; C16-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5
+; C16-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]]
+; C16-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[TMP16]], i64 5
+; C16-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6
+; C16-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]]
+; C16-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP19]], i64 6
+; C16-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7
+; C16-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]]
+; C16-NEXT: [[TMP23:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP22]], i64 7
+; C16-NEXT: store <8 x i8> [[TMP23]], ptr addrspace(1) [[OUT]], align 4
+; C16-NEXT: ret void
+;
+; C32-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v8i8(
+; C32-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C32-NEXT: [[ENTRY:.*:]]
+; C32-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison
+; C32-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 8
+; C32-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]]
+; C32-NEXT: [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
+; C32-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1
+; C32-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]]
+; C32-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[TMP4]], i64 1
+; C32-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2
+; C32-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]]
+; C32-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[TMP7]], i64 2
+; C32-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3
+; C32-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]]
+; C32-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[TMP10]], i64 3
+; C32-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4
+; C32-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]]
+; C32-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[TMP13]], i64 4
+; C32-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5
+; C32-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]]
+; C32-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[TMP16]], i64 5
+; C32-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6
+; C32-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]]
+; C32-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP19]], i64 6
+; C32-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7
+; C32-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]]
+; C32-NEXT: [[TMP23:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP22]], i64 7
+; C32-NEXT: store <8 x i8> [[TMP23]], ptr addrspace(1) [[OUT]], align 4
+; C32-NEXT: ret void
+;
+entry:
+ %alloca = alloca [64 x i8], align 4, addrspace(5)
+ %gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx
+ %vec = load <8 x i8>, ptr addrspace(5) %gep, align 4
+ store <8 x i8> %vec, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @GEP_dynamic_idx_v16i8(ptr addrspace(1) %out, i32 %idx) {
+; C4-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v16i8(
+; C4-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C4-NEXT: [[ENTRY:.*:]]
+; C4-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
+; C4-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
+; C4-NEXT: [[VEC:%.*]] = load <16 x i8>, ptr addrspace(5) [[GEP]], align 4
+; C4-NEXT: store <16 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4
+; C4-NEXT: ret void
+;
+; C8-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v16i8(
+; C8-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C8-NEXT: [[ENTRY:.*:]]
+; C8-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
+; C8-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
+; C8-NEXT: [[VEC:%.*]] = load <16 x i8>, ptr addrspace(5) [[GEP]], align 4
+; C8-NEXT: store <16 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4
+; C8-NEXT: ret void
+;
+; C16-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v16i8(
+; C16-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C16-NEXT: [[ENTRY:.*:]]
+; C16-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison
+; C16-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 16
+; C16-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]]
+; C16-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> poison, i8 [[TMP1]], i64 0
+; C16-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1
+; C16-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]]
+; C16-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[TMP4]], i64 1
+; C16-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2
+; C16-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]]
+; C16-NEXT: [[TMP8:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[TMP7]], i64 2
+; C16-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3
+; C16-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]]
+; C16-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[TMP10]], i64 3
+; C16-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4
+; C16-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]]
+; C16-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[TMP13]], i64 4
+; C16-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5
+; C16-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]]
+; C16-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[TMP16]], i64 5
+; C16-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6
+; C16-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]]
+; C16-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> [[TMP17]], i8 [[TMP19]], i64 6
+; C16-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7
+; C16-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]]
+; C16-NEXT: [[TMP23:%.*]] = insertelement <16 x i8> [[TMP20]], i8 [[TMP22]], i64 7
+; C16-NEXT: [[TMP24:%.*]] = add i32 [[TMP0]], 8
+; C16-NEXT: [[TMP25:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP24]]
+; C16-NEXT: [[TMP26:%.*]] = insertelement <16 x i8> [[TMP23]], i8 [[TMP25]], i64 8
+; C16-NEXT: [[TMP27:%.*]] = add i32 [[TMP0]], 9
+; C16-NEXT: [[TMP28:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP27]]
+; C16-NEXT: [[TMP29:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP28]], i64 9
+; C16-NEXT: [[TMP30:%.*]] = add i32 [[TMP0]], 10
+; C16-NEXT: [[TMP31:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP30]]
+; C16-NEXT: [[TMP32:%.*]] = insertelement <16 x i8> [[TMP29]], i8 [[TMP31]], i64 10
+; C16-NEXT: [[TMP33:%.*]] = add i32 [[TMP0]], 11
+; C16-NEXT: [[TMP34:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP33]]
+; C16-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP32]], i8 [[TMP34]], i64 11
+; C16-NEXT: [[TMP36:%.*]] = add i32 [[TMP0]], 12
+; C16-NEXT: [[TMP37:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP36]]
+; C16-NEXT: [[TMP38:%.*]] = insertelement <16 x i8> [[TMP35]], i8 [[TMP37]], i64 12
+; C16-NEXT: [[TMP39:%.*]] = add i32 [[TMP0]], 13
+; C16-NEXT: [[TMP40:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP39]]
+; C16-NEXT: [[TMP41:%.*]] = insertelement <16 x i8> [[TMP38]], i8 [[TMP40]], i64 13
+; C16-NEXT: [[TMP42:%.*]] = add i32 [[TMP0]], 14
+; C16-NEXT: [[TMP43:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP42]]
+; C16-NEXT: [[TMP44:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP43]], i64 14
+; C16-NEXT: [[TMP45:%.*]] = add i32 [[TMP0]], 15
+; C16-NEXT: [[TMP46:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP45]]
+; C16-NEXT: [[TMP47:%.*]] = insertelement <16 x i8> [[TMP44]], i8 [[TMP46]], i64 15
+; C16-NEXT: store <16 x i8> [[TMP47]], ptr addrspace(1) [[OUT]], align 4
+; C16-NEXT: ret void
+;
+; C32-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v16i8(
+; C32-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C32-NEXT: [[ENTRY:.*:]]
+; C32-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison
+; C32-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 16
+; C32-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]]
+; C32-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> poison, i8 [[TMP1]], i64 0
+; C32-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1
+; C32-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]]
+; C32-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[TMP4]], i64 1
+; C32-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2
+; C32-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]]
+; C32-NEXT: [[TMP8:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[TMP7]], i64 2
+; C32-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3
+; C32-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]]
+; C32-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[TMP10]], i64 3
+; C32-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4
+; C32-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]]
+; C32-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[TMP13]], i64 4
+; C32-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5
+; C32-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]]
+; C32-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[TMP16]], i64 5
+; C32-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6
+; C32-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]]
+; C32-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> [[TMP17]], i8 [[TMP19]], i64 6
+; C32-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7
+; C32-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]]
+; C32-NEXT: [[TMP23:%.*]] = insertelement <16 x i8> [[TMP20]], i8 [[TMP22]], i64 7
+; C32-NEXT: [[TMP24:%.*]] = add i32 [[TMP0]], 8
+; C32-NEXT: [[TMP25:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP24]]
+; C32-NEXT: [[TMP26:%.*]] = insertelement <16 x i8> [[TMP23]], i8 [[TMP25]], i64 8
+; C32-NEXT: [[TMP27:%.*]] = add i32 [[TMP0]], 9
+; C32-NEXT: [[TMP28:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP27]]
+; C32-NEXT: [[TMP29:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP28]], i64 9
+; C32-NEXT: [[TMP30:%.*]] = add i32 [[TMP0]], 10
+; C32-NEXT: [[TMP31:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP30]]
+; C32-NEXT: [[TMP32:%.*]] = insertelement <16 x i8> [[TMP29]], i8 [[TMP31]], i64 10
+; C32-NEXT: [[TMP33:%.*]] = add i32 [[TMP0]], 11
+; C32-NEXT: [[TMP34:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP33]]
+; C32-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP32]], i8 [[TMP34]], i64 11
+; C32-NEXT: [[TMP36:%.*]] = add i32 [[TMP0]], 12
+; C32-NEXT: [[TMP37:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP36]]
+; C32-NEXT: [[TMP38:%.*]] = insertelement <16 x i8> [[TMP35]], i8 [[TMP37]], i64 12
+; C32-NEXT: [[TMP39:%.*]] = add i32 [[TMP0]], 13
+; C32-NEXT: [[TMP40:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP39]]
+; C32-NEXT: [[TMP41:%.*]] = insertelement <16 x i8> [[TMP38]], i8 [[TMP40]], i64 13
+; C32-NEXT: [[TMP42:%.*]] = add i32 [[TMP0]], 14
+; C32-NEXT: [[TMP43:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP42]]
+; C32-NEXT: [[TMP44:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP43]], i64 14
+; C32-NEXT: [[TMP45:%.*]] = add i32 [[TMP0]], 15
+; C32-NEXT: [[TMP46:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP45]]
+; C32-NEXT: [[TMP47:%.*]] = insertelement <16 x i8> [[TMP44]], i8 [[TMP46]], i64 15
+; C32-NEXT: store <16 x i8> [[TMP47]], ptr addrspace(1) [[OUT]], align 4
+; C32-NEXT: ret void
+;
+entry:
+ %alloca = alloca [64 x i8], align 4, addrspace(5)
+ %gep = getelementptr inbounds <16 x i8>, ptr addrspace(5) %alloca, i32 %idx
+ %vec = load <16 x i8>, ptr addrspace(5) %gep, align 4
+ store <16 x i8> %vec, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @GEP_dynamic_idx_v32i8(ptr addrspace(1) %out, i32 %idx) {
+; C4-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v32i8(
+; C4-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C4-NEXT: [[ENTRY:.*:]]
+; C4-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
+; C4-NEXT: [[GEP:%.*]] = getelementptr inbounds <32 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
+; C4-NEXT: [[VEC:%.*]] = load <32 x i8>, ptr addrspace(5) [[GEP]], align 4
+; C4-NEXT: store <32 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4
+; C4-NEXT: ret void
+;
+; C8-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v32i8(
+; C8-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C8-NEXT: [[ENTRY:.*:]]
+; C8-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
+; C8-NEXT: [[GEP:%.*]] = getelementptr inbounds <32 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
+; C8-NEXT: [[VEC:%.*]] = load <32 x i8>, ptr addrspace(5) [[GEP]], align 4
+; C8-NEXT: store <32 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4
+; C8-NEXT: ret void
+;
+; C16-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v32i8(
+; C16-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C16-NEXT: [[ENTRY:.*:]]
+; C16-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
+; C16-NEXT: [[GEP:%.*]] = getelementptr inbounds <32 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
+; C16-NEXT: [[VEC:%.*]] = load <32 x i8>, ptr addrspace(5) [[GEP]], align 4
+; C16-NEXT: store <32 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4
+; C16-NEXT: ret void
+;
+; C32-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v32i8(
+; C32-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C32-NEXT: [[ENTRY:.*:]]
+; C32-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison
+; C32-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 32
+; C32-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]]
+; C32-NEXT: [[TMP2:%.*]] = insertelement <32 x i8> poison, i8 [[TMP1]], i64 0
+; C32-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1
+; C32-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]]
+; C32-NEXT: [[TMP5:%.*]] = insertelement <32 x i8> [[TMP2]], i8 [[TMP4]], i64 1
+; C32-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2
+; C32-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]]
+; C32-NEXT: [[TMP8:%.*]] = insertelement <32 x i8> [[TMP5]], i8 [[TMP7]], i64 2
+; C32-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3
+; C32-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]]
+; C32-NEXT: [[TMP11:%.*]] = insertelement <32 x i8> [[TMP8]], i8 [[TMP10]], i64 3
+; C32-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4
+; C32-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]]
+; C32-NEXT: [[TMP14:%.*]] = insertelement <32 x i8> [[TMP11]], i8 [[TMP13]], i64 4
+; C32-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5
+; C32-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]]
+; C32-NEXT: [[TMP17:%.*]] = insertelement <32 x i8> [[TMP14]], i8 [[TMP16]], i64 5
+; C32-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6
+; C32-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]]
+; C32-NEXT: [[TMP20:%.*]] = insertelement <32 x i8> [[TMP17]], i8 [[TMP19]], i64 6
+; C32-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7
+; C32-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]]
+; C32-NEXT: [[TMP23:%.*]] = insertelement <32 x i8> [[TMP20]], i8 [[TMP22]], i64 7
+; C32-NEXT: [[TMP24:%.*]] = add i32 [[TMP0]], 8
+; C32-NEXT: [[TMP25:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP24]]
+; C32-NEXT: [[TMP26:%.*]] = insertelement <32 x i8> [[TMP23]], i8 [[TMP25]], i64 8
+; C32-NEXT: [[TMP27:%.*]] = add i32 [[TMP0]], 9
+; C32-NEXT: [[TMP28:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP27]]
+; C32-NEXT: [[TMP29:%.*]] = insertelement <32 x i8> [[TMP26]], i8 [[TMP28]], i64 9
+; C32-NEXT: [[TMP30:%.*]] = add i32 [[TMP0]], 10
+; C32-NEXT: [[TMP31:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP30]]
+; C32-NEXT: [[TMP32:%.*]] = insertelement <32 x i8> [[TMP29]], i8 [[TMP31]], i64 10
+; C32-NEXT: [[TMP33:%.*]] = add i32 [[TMP0]], 11
+; C32-NEXT: [[TMP34:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP33]]
+; C32-NEXT: [[TMP35:%.*]] = insertelement <32 x i8> [[TMP32]], i8 [[TMP34]], i64 11
+; C32-NEXT: [[TMP36:%.*]] = add i32 [[TMP0]], 12
+; C32-NEXT: [[TMP37:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP36]]
+; C32-NEXT: [[TMP38:%.*]] = insertelement <32 x i8> [[TMP35]], i8 [[TMP37]], i64 12
+; C32-NEXT: [[TMP39:%.*]] = add i32 [[TMP0]], 13
+; C32-NEXT: [[TMP40:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP39]]
+; C32-NEXT: [[TMP41:%.*]] = insertelement <32 x i8> [[TMP38]], i8 [[TMP40]], i64 13
+; C32-NEXT: [[TMP42:%.*]] = add i32 [[TMP0]], 14
+; C32-NEXT: [[TMP43:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP42]]
+; C32-NEXT: [[TMP44:%.*]] = insertelement <32 x i8> [[TMP41]], i8 [[TMP43]], i64 14
+; C32-NEXT: [[TMP45:%.*]] = add i32 [[TMP0]], 15
+; C32-NEXT: [[TMP46:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP45]]
+; C32-NEXT: [[TMP47:%.*]] = insertelement <32 x i8> [[TMP44]], i8 [[TMP46]], i64 15
+; C32-NEXT: [[TMP48:%.*]] = add i32 [[TMP0]], 16
+; C32-NEXT: [[TMP49:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP48]]
+; C32-NEXT: [[TMP50:%.*]] = insertelement <32 x i8> [[TMP47]], i8 [[TMP49]], i64 16
+; C32-NEXT: [[TMP51:%.*]] = add i32 [[TMP0]], 17
+; C32-NEXT: [[TMP52:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP51]]
+; C32-NEXT: [[TMP53:%.*]] = insertelement <32 x i8> [[TMP50]], i8 [[TMP52]], i64 17
+; C32-NEXT: [[TMP54:%.*]] = add i32 [[TMP0]], 18
+; C32-NEXT: [[TMP55:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP54]]
+; C32-NEXT: [[TMP56:%.*]] = insertelement <32 x i8> [[TMP53]], i8 [[TMP55]], i64 18
+; C32-NEXT: [[TMP57:%.*]] = add i32 [[TMP0]], 19
+; C32-NEXT: [[TMP58:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP57]]
+; C32-NEXT: [[TMP59:%.*]] = insertelement <32 x i8> [[TMP56]], i8 [[TMP58]], i64 19
+; C32-NEXT: [[TMP60:%.*]] = add i32 [[TMP0]], 20
+; C32-NEXT: [[TMP61:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP60]]
+; C32-NEXT: [[TMP62:%.*]] = insertelement <32 x i8> [[TMP59]], i8 [[TMP61]], i64 20
+; C32-NEXT: [[TMP63:%.*]] = add i32 [[TMP0]], 21
+; C32-NEXT: [[TMP64:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP63]]
+; C32-NEXT: [[TMP65:%.*]] = insertelement <32 x i8> [[TMP62]], i8 [[TMP64]], i64 21
+; C32-NEXT: [[TMP66:%.*]] = add i32 [[TMP0]], 22
+; C32-NEXT: [[TMP67:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP66]]
+; C32-NEXT: [[TMP68:%.*]] = insertelement <32 x i8> [[TMP65]], i8 [[TMP67]], i64 22
+; C32-NEXT: [[TMP69:%.*]] = add i32 [[TMP0]], 23
+; C32-NEXT: [[TMP70:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP69]]
+; C32-NEXT: [[TMP71:%.*]] = insertelement <32 x i8> [[TMP68]], i8 [[TMP70]], i64 23
+; C32-NEXT: [[TMP72:%.*]] = add i32 [[TMP0]], 24
+; C32-NEXT: [[TMP73:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP72]]
+; C32-NEXT: [[TMP74:%.*]] = insertelement <32 x i8> [[TMP71]], i8 [[TMP73]], i64 24
+; C32-NEXT: [[TMP75:%.*]] = add i32 [[TMP0]], 25
+; C32-NEXT: [[TMP76:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP75]]
+; C32-NEXT: [[TMP77:%.*]] = insertelement <32 x i8> [[TMP74]], i8 [[TMP76]], i64 25
+; C32-NEXT: [[TMP78:%.*]] = add i32 [[TMP0]], 26
+; C32-NEXT: [[TMP79:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP78]]
+; C32-NEXT: [[TMP80:%.*]] = insertelement <32 x i8> [[TMP77]], i8 [[TMP79]], i64 26
+; C32-NEXT: [[TMP81:%.*]] = add i32 [[TMP0]], 27
+; C32-NEXT: [[TMP82:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP81]]
+; C32-NEXT: [[TMP83:%.*]] = insertelement <32 x i8> [[TMP80]], i8 [[TMP82]], i64 27
+; C32-NEXT: [[TMP84:%.*]] = add i32 [[TMP0]], 28
+; C32-NEXT: [[TMP85:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP84]]
+; C32-NEXT: [[TMP86:%.*]] = insertelement <32 x i8> [[TMP83]], i8 [[TMP85]], i64 28
+; C32-NEXT: [[TMP87:%.*]] = add i32 [[TMP0]], 29
+; C32-NEXT: [[TMP88:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP87]]
+; C32-NEXT: [[TMP89:%.*]] = insertelement <32 x i8> [[TMP86]], i8 [[TMP88]], i64 29
+; C32-NEXT: [[TMP90:%.*]] = add i32 [[TMP0]], 30
+; C32-NEXT: [[TMP91:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP90]]
+; C32-NEXT: [[TMP92:%.*]] = insertelement <32 x i8> [[TMP89]], i8 [[TMP91]], i64 30
+; C32-NEXT: [[TMP93:%.*]] = add i32 [[TMP0]], 31
+; C32-NEXT: [[TMP94:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP93]]
+; C32-NEXT: [[TMP95:%.*]] = insertelement <32 x i8> [[TMP92]], i8 [[TMP94]], i64 31
+; C32-NEXT: store <32 x i8> [[TMP95]], ptr addrspace(1) [[OUT]], align 4
+; C32-NEXT: ret void
+;
+entry:
+ %alloca = alloca [64 x i8], align 4, addrspace(5)
+ %gep = getelementptr inbounds <32 x i8>, ptr addrspace(5) %alloca, i32 %idx
+ %vec = load <32 x i8>, ptr addrspace(5) %gep, align 4
+ store <32 x i8> %vec, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @GEP_dynamic_idx_noload(ptr addrspace(1) %out, i32 %idx) {
+; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
+; CHECK-NEXT: [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64
+; CHECK-NEXT: store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+; C4-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload(
+; C4-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C4-NEXT: [[ENTRY:.*:]]
+; C4-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
+; C4-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
+; C4-NEXT: [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64
+; C4-NEXT: store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4
+; C4-NEXT: ret void
+;
+; C8-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload(
+; C8-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C8-NEXT: [[ENTRY:.*:]]
+; C8-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
+; C8-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
+; C8-NEXT: [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64
+; C8-NEXT: store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4
+; C8-NEXT: ret void
+;
+; C16-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload(
+; C16-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C16-NEXT: [[ENTRY:.*:]]
+; C16-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
+; C16-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
+; C16-NEXT: [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64
+; C16-NEXT: store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4
+; C16-NEXT: ret void
+;
+; C32-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload(
+; C32-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
+; C32-NEXT: [[ENTRY:.*:]]
+; C32-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
+; C32-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
+; C32-NEXT: [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64
+; C32-NEXT: store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4
+; C32-NEXT: ret void
+;
+entry:
+ %alloca = alloca [64 x i8], align 4, addrspace(5)
+ %gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx
+ %gepint = ptrtoint ptr addrspace(5) %gep to i64
+ store i64 %gepint, ptr addrspace(1) %out, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
index caab29b58c13f..85a987f7c3a28 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
@@ -252,84 +252,6 @@ bb2:
store i32 0, ptr addrspace(5) %extractelement
ret void
}
-
-define amdgpu_kernel void @GEP_dynamic_idx_v32i8(ptr addrspace(1) %out, i32 %idx) {
-; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v32i8(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
-; CHECK-NEXT: [[VEC:%.*]] = load <16 x i8>, ptr addrspace(5) [[GEP]], align 4
-; CHECK-NEXT: store <16 x i8> [[VEC]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT: ret void
-;
-entry:
- %alloca = alloca [64 x i8], align 4, addrspace(5)
- %gep = getelementptr inbounds <16 x i8>, ptr addrspace(5) %alloca, i32 %idx
- %vec = load <16 x i8>, ptr addrspace(5) %gep, align 4
- store <16 x i8> %vec, ptr addrspace(1) %out, align 4
- ret void
-}
-
-define amdgpu_kernel void @GEP_dynamic_idx_v8i8(ptr addrspace(1) %out, i32 %idx) {
-; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_v8i8(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <64 x i8> poison
-; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 8
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[TMP4]], i64 1
-; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP0]], 2
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[TMP7]], i64 2
-; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP0]], 3
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> [[TMP8]], i8 [[TMP10]], i64 3
-; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP0]], 4
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[TMP13]], i64 4
-; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP0]], 5
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[TMP16]], i64 5
-; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP0]], 6
-; CHECK-NEXT: [[TMP19:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP19]], i64 6
-; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP0]], 7
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <64 x i8> [[ALLOCA]], i32 [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP22]], i64 7
-; CHECK-NEXT: store <8 x i8> [[TMP23]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT: ret void
-;
-entry:
- %alloca = alloca [64 x i8], align 4, addrspace(5)
- %gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx
- %vec = load <8 x i8>, ptr addrspace(5) %gep, align 4
- store <8 x i8> %vec, ptr addrspace(1) %out, align 4
- ret void
-}
-
-define amdgpu_kernel void @GEP_dynamic_idx_noload(ptr addrspace(1) %out, i32 %idx) {
-; CHECK-LABEL: define amdgpu_kernel void @GEP_dynamic_idx_noload(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IDX:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64 x i8], align 4, addrspace(5)
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i8>, ptr addrspace(5) [[ALLOCA]], i32 [[IDX]]
-; CHECK-NEXT: [[GEPINT:%.*]] = ptrtoint ptr addrspace(5) [[GEP]] to i64
-; CHECK-NEXT: store i64 [[GEPINT]], ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT: ret void
-;
-entry:
- %alloca = alloca [64 x i8], align 4, addrspace(5)
- %gep = getelementptr inbounds <8 x i8>, ptr addrspace(5) %alloca, i32 %idx
- %gepint = ptrtoint ptr addrspace(5) %gep to i64
- store i64 %gepint, ptr addrspace(1) %out, align 4
- ret void
-}
-
-
;.
; CHECK: [[META0]] = !{}
; CHECK: [[RNG1]] = !{i32 0, i32 1025}
>From 8700e910df7dae1e71d6313c8c2ab1a3c9894a2f Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 3 Dec 2025 14:07:10 -0600
Subject: [PATCH 09/10] format
---
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 3b656480770b6..b1bb1ae27e77a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -931,8 +931,8 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
GEP->users(), [&](const auto *U) { return isa<LoadInst>(U); });
if (auto *UserVecTy =
dyn_cast<FixedVectorType>(GEP->getSourceElementType())) {
- if (UsedInLoad &&
- UserVecTy->getNumElements() > PromoteAllocaDynamicIndexNumberElementLimit) {
+ if (UsedInLoad && UserVecTy->getNumElements() >
+ PromoteAllocaDynamicIndexNumberElementLimit) {
return RejectUser(Inst,
"user has too many elements for dynamic index");
}
>From 9d7b94bccac8c94388951e44df93f4fe48b5ab4a Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 3 Dec 2025 14:17:14 -0600
Subject: [PATCH 10/10] remove comment
---
llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll | 2 --
1 file changed, 2 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
index 85a987f7c3a28..76e1868b3c4b9 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
@@ -3,8 +3,6 @@
; Check that invalid IR is not produced on a vector typed
; getelementptr with a scalar alloca pointer base.
-; Also check if GEP with dynamic index is rejected above
-; threshold # of elements.
define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
More information about the llvm-commits mailing list