[llvm] [AMDGPU] Extend promotion of alloca to vectors (PR #127973)

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 10 23:39:30 PDT 2025


https://github.com/perlfu updated https://github.com/llvm/llvm-project/pull/127973

>From 9293d1aed312c4ade0546b77f701fdcd4a57697c Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Thu, 20 Feb 2025 16:42:30 +0900
Subject: [PATCH 1/3] [AMDGPU] Extend promotion of alloca to vectors

* Add multi dimensional array support
* Make maximum vector size tunable
* Make ratio of VGPRs used for vector promotion tunable
---
 llvm/docs/AMDGPUUsage.rst                     |   4 +
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 111 +++++--
 .../CodeGen/AMDGPU/amdgpu.private-memory.ll   |   9 +-
 .../test/CodeGen/AMDGPU/array-ptr-calc-i32.ll |   8 +-
 .../AMDGPU/promote-alloca-max-elements.ll     | 236 ++++++++++++++
 .../CodeGen/AMDGPU/promote-alloca-memset.ll   |  12 +-
 .../CodeGen/AMDGPU/promote-alloca-multidim.ll | 292 ++++++++++++++++++
 .../CodeGen/AMDGPU/promote-alloca-no-opts.ll  |   4 +-
 .../AMDGPU/promote-alloca-vgpr-ratio.ll       | 276 +++++++++++++++++
 9 files changed, 907 insertions(+), 45 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-max-elements.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 9932074830866..d163d52cc1b31 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1720,6 +1720,10 @@ The AMDGPU backend supports the following LLVM IR attributes.
      "amdgpu-sgpr-hazard-mem-wait-cull-threshold"     Sets the number of active SGPR hazards that must be present before
                                                       inserting a cull sequence at a memory wait.
 
+     "amdgpu-promote-alloca-to-vector-max-elements"   Maximum vector size (in elements) to create when promoting alloca.
+
+     "amdgpu-promote-alloca-to-vector-vgpr-ratio"     Ratio of VGPRs to budget for promoting alloca to vectors.
+
      ================================================ ==========================================================
 
 Calling Conventions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 69ddb384e1a40..aec00a28c1619 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -66,6 +66,18 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
     cl::desc("Maximum byte size to consider promote alloca to vector"),
     cl::init(0));
 
+static cl::opt<unsigned> PromoteAllocaToVectorMaxElements(
+    "amdgpu-promote-alloca-to-vector-max-elements",
+    cl::desc("Maximum vector size (in elements) to use when promoting alloca"),
+    cl::init(16));
+
+// Use up to 1/4 of available register budget for vectorization.
+// FIXME: Increase the limit for whole function budgets? Perhaps x2?
+static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
+    "amdgpu-promote-alloca-to-vector-vgpr-ratio",
+    cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
+    cl::init(4));
+
 static cl::opt<unsigned>
     LoopUserWeight("promote-alloca-vector-loop-user-weight",
                    cl::desc("The bonus weight of users of allocas within loop "
@@ -310,12 +322,17 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
 
   bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
 
-  // Use up to 1/4 of available register budget for vectorization.
-  // FIXME: Increase the limit for whole function budgets? Perhaps x2?
+  const unsigned VGPRRatio =
+      PromoteAllocaToVectorVGPRRatio.getNumOccurrences()
+          ? PromoteAllocaToVectorVGPRRatio
+          : F.getFnAttributeAsParsedInteger(
+                "amdgpu-promote-alloca-to-vector-vgpr-ratio",
+                PromoteAllocaToVectorVGPRRatio);
+
   unsigned VectorizationBudget =
       (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                   : (MaxVGPRs * 32)) /
-      4;
+      VGPRRatio;
 
   SmallVector<AllocaInst *, 16> Allocas;
   for (Instruction &I : F.getEntryBlock()) {
@@ -398,7 +415,8 @@ calculateVectorIndex(Value *Ptr,
 }
 
 static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
-                               Type *VecElemTy, const DataLayout &DL) {
+                               Type *VecElemTy, const DataLayout &DL,
+                               SmallVector<Instruction *> &NewInsts) {
   // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
   // helper.
   unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
@@ -412,22 +430,37 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
   if (VarOffsets.size() > 1)
     return nullptr;
 
-  if (VarOffsets.size() == 1) {
-    // Only handle cases where we don't need to insert extra arithmetic
-    // instructions.
-    const auto &VarOffset = VarOffsets.front();
-    if (!ConstOffset.isZero() || VarOffset.second != VecElemSize)
-      return nullptr;
-    return VarOffset.first;
-  }
-
   APInt Quot;
   uint64_t Rem;
   APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem);
   if (Rem != 0)
     return nullptr;
 
-  return ConstantInt::get(GEP->getContext(), Quot);
+  ConstantInt *ConstIndex = ConstantInt::get(GEP->getContext(), Quot);
+  if (VarOffsets.size() == 0)
+    return ConstIndex;
+
+  IRBuilder<> Builder(GEP);
+
+  const auto &VarOffset = VarOffsets.front();
+  APInt::udivrem(VarOffset.second, VecElemSize, Quot, Rem);
+  if (Rem != 0 || Quot.isZero())
+    return nullptr;
+
+  Value *Offset = VarOffset.first;
+  if (!Quot.isOne()) {
+    ConstantInt *ConstMul = ConstantInt::get(GEP->getContext(), Quot);
+    Offset = Builder.CreateMul(Offset, ConstMul);
+    if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
+      NewInsts.push_back(NewInst);
+  }
+  if (ConstOffset.isZero())
+    return Offset;
+
+  Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset);
+  if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
+    NewInsts.push_back(NewInst);
+  return IndexAdd;
 }
 
 /// Promotes a single user of the alloca to a vector form.
@@ -735,23 +768,48 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   Type *AllocaTy = Alloca.getAllocatedType();
   auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
   if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
-    if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
-        ArrayTy->getNumElements() > 0)
-      VectorTy = FixedVectorType::get(ArrayTy->getElementType(),
-                                      ArrayTy->getNumElements());
+    uint64_t NumElems = 1;
+    Type *ElemTy;
+    do {
+      NumElems *= ArrayTy->getNumElements();
+      ElemTy = ArrayTy->getElementType();
+    } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
+
+    // Check for array of vectors
+    auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
+    if (InnerVectorTy) {
+      NumElems *= InnerVectorTy->getNumElements();
+      ElemTy = InnerVectorTy->getElementType();
+    }
+
+    if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
+      unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
+      unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
+      // Expand vector if required to match padding of inner type,
+      // i.e. odd size subvectors.
+      // Storage size of new vector must match that of alloca for correct
+      // behaviour of byte offsets and GEP computation.
+      if (NumElems * ElementSize != AllocaSize)
+        NumElems = AllocaSize / ElementSize;
+      if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
+        VectorTy = FixedVectorType::get(ElemTy, NumElems);
+    }
   }
 
-  // FIXME: There is no reason why we can't support larger arrays, we
-  // are just being conservative for now.
-  // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
-  // equivalent. Potentially these could also be promoted but we don't currently
-  // handle this case
   if (!VectorTy) {
     LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
     return false;
   }
 
-  if (VectorTy->getNumElements() > 16 || VectorTy->getNumElements() < 2) {
+  const unsigned MaxElements =
+      PromoteAllocaToVectorMaxElements.getNumOccurrences()
+          ? PromoteAllocaToVectorMaxElements
+          : Alloca.getParent()->getParent()->getFnAttributeAsParsedInteger(
+                "amdgpu-promote-alloca-to-vector-max-elements",
+                PromoteAllocaToVectorMaxElements);
+
+  if (VectorTy->getNumElements() > MaxElements ||
+      VectorTy->getNumElements() < 2) {
     LLVM_DEBUG(dbgs() << "  " << *VectorTy
                       << " has an unsupported number of elements\n");
     return false;
@@ -761,11 +819,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   SmallVector<Instruction *> WorkList;
   SmallVector<Instruction *> UsersToRemove;
   SmallVector<Instruction *> DeferredInsts;
+  SmallVector<Instruction *> NewGEPInsts;
   DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
 
   const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
     LLVM_DEBUG(dbgs() << "  Cannot promote alloca to vector: " << Msg << "\n"
                       << "    " << *Inst << "\n");
+    for (auto *Inst : reverse(NewGEPInsts))
+      Inst->eraseFromParent();
     return false;
   };
 
@@ -815,7 +876,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
     if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
       // If we can't compute a vector index from this GEP, then we can't
       // promote this alloca to vector.
-      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL);
+      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
       if (!Index)
         return RejectUser(Inst, "cannot compute vector index for GEP");
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 82832277b1aba..a663d451cad35 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -258,7 +258,7 @@ entry:
 ; FUNC-LABEL: {{^}}no_overlap:
 ;
 ; A total of 5 bytes should be allocated and used.
-; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ;
+; SI-ALLOCA: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ;
 define amdgpu_kernel void @no_overlap(ptr addrspace(1) %out, i32 %in) #0 {
 entry:
   %0 = alloca [3 x i8], align 1, addrspace(5)
@@ -281,6 +281,7 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}char_array_array:
 define amdgpu_kernel void @char_array_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i8]], addrspace(5)
@@ -294,6 +295,7 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}i32_array_array:
 define amdgpu_kernel void @i32_array_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i32]], addrspace(5)
@@ -306,6 +308,7 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}i64_array_array:
 define amdgpu_kernel void @i64_array_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i64]], addrspace(5)
@@ -319,7 +322,7 @@ entry:
 }
 
 %struct.pair32 = type { i32, i32 }
-
+; FUNC-LABEL: {{^}}struct_array_array:
 define amdgpu_kernel void @struct_array_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x %struct.pair32]], addrspace(5)
@@ -333,6 +336,7 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}struct_pair32_array:
 define amdgpu_kernel void @struct_pair32_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x %struct.pair32], addrspace(5)
@@ -346,6 +350,7 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}select_private:
 define amdgpu_kernel void @select_private(ptr addrspace(1) %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32], addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index ac8782e1e542f..e1bbc243344b0 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -22,13 +22,7 @@ declare void @llvm.amdgcn.s.barrier() #2
 ; SI-ALLOCA: s_barrier
 ; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
 ;
-; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
-; alloca to a vector.  It currently fails because it does not know how
-; to interpret:
-; getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 1, i32 %b
-
-; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64
-; SI-PROMOTE: ds_write_b32 [[PTRREG]]
+; SI-PROMOTE: LDSByteSize: 0
 define amdgpu_kernel void @test_private_array_ptr_calc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) #0 {
   %alloca = alloca [16 x i32], align 16, addrspace(5)
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-max-elements.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-elements.ll
new file mode 100644
index 0000000000000..7f1cf728bce7e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-elements.ll
@@ -0,0 +1,236 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE --check-prefix=DEFAULT %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-elements=24 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX24
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-elements=32 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX32
+
+define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
+; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements(
+; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; DEFAULT-NEXT:    [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5)
+; DEFAULT-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
+; DEFAULT-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; DEFAULT-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
+; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; DEFAULT-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    ret void
+;
+; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements(
+; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; MAX24-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX24-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX24-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX24-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX24-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX24-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX24-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; MAX24-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; MAX24-NEXT:    ret void
+;
+; MAX32-LABEL: define amdgpu_kernel void @i32_24_elements(
+; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; MAX32-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX32-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX32-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX32-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX32-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX32-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX32-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; MAX32-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; MAX32-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [24 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false)
+  %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20
+  store i32 42, ptr addrspace(5) %gep.0
+  store i32 43, ptr addrspace(5) %gep.1
+  %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 {
+; BASE-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
+; BASE-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; BASE-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; BASE-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; BASE-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; BASE-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; BASE-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; BASE-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; BASE-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; BASE-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; BASE-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [24 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false)
+  %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20
+  store i32 42, ptr addrspace(5) %gep.0
+  store i32 43, ptr addrspace(5) %gep.1
+  %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i32_32_elements(ptr %out) #0 {
+; DEFAULT-LABEL: define amdgpu_kernel void @i32_32_elements(
+; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] {
+; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; DEFAULT-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5)
+; DEFAULT-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false)
+; DEFAULT-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; DEFAULT-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30
+; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; DEFAULT-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    ret void
+;
+; MAX24-LABEL: define amdgpu_kernel void @i32_32_elements(
+; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] {
+; MAX24-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX24-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX24-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX24-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX24-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX24-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX24-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5)
+; MAX24-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false)
+; MAX24-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; MAX24-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30
+; MAX24-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; MAX24-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; MAX24-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; MAX24-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; MAX24-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; MAX24-NEXT:    ret void
+;
+; MAX32-LABEL: define amdgpu_kernel void @i32_32_elements(
+; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] {
+; MAX32-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX32-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX32-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX32-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX32-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX32-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX32-NEXT:    [[TMP1:%.*]] = extractelement <32 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0>, i32 [[SEL2]]
+; MAX32-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; MAX32-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [32 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 128, i1 false)
+  %gep.0 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 30
+  store i32 42, ptr addrspace(5) %gep.0
+  store i32 43, ptr addrspace(5) %gep.1
+  %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i32_32_elements_attrib(ptr %out) #2 {
+; DEFAULT-LABEL: define amdgpu_kernel void @i32_32_elements_attrib(
+; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; DEFAULT-NEXT:    [[TMP1:%.*]] = extractelement <32 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0>, i32 [[SEL2]]
+; DEFAULT-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    ret void
+;
+; MAX24-LABEL: define amdgpu_kernel void @i32_32_elements_attrib(
+; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; MAX24-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX24-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX24-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX24-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX24-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX24-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX24-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5)
+; MAX24-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false)
+; MAX24-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; MAX24-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30
+; MAX24-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; MAX24-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; MAX24-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; MAX24-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; MAX24-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; MAX24-NEXT:    ret void
+;
+; MAX32-LABEL: define amdgpu_kernel void @i32_32_elements_attrib(
+; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; MAX32-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX32-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX32-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX32-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX32-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX32-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX32-NEXT:    [[TMP1:%.*]] = extractelement <32 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0>, i32 [[SEL2]]
+; MAX32-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; MAX32-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [32 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 128, i1 false)
+  %gep.0 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 30
+  store i32 42, ptr addrspace(5) %gep.0
+  store i32 43, ptr addrspace(5) %gep.1
+  %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg)
+
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" }
+attributes #1 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-elements"="24" }
+attributes #2 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-elements"="32" }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
index 74651e10c7809..bb51c1d27b336 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
@@ -110,10 +110,7 @@ define amdgpu_kernel void @memset_vector_ptr_alloca(ptr %out) {
 
 define amdgpu_kernel void @memset_array_of_array_ptr_alloca(ptr %out) {
 ; CHECK-LABEL: @memset_array_of_array_ptr_alloca(
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [2 x [3 x ptr]], align 16, addrspace(5)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[ALLOCA]], i8 0, i64 48, i1 false)
-; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr addrspace(5) [[ALLOCA]], align 8
-; CHECK-NEXT:    store i64 [[LOAD]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    store i64 0, ptr [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %alloca = alloca [2 x [3 x ptr]], align 16, addrspace(5)
@@ -125,14 +122,11 @@ define amdgpu_kernel void @memset_array_of_array_ptr_alloca(ptr %out) {
 
 define amdgpu_kernel void @memset_array_of_vec_ptr_alloca(ptr %out) {
 ; CHECK-LABEL: @memset_array_of_vec_ptr_alloca(
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [2 x <3 x ptr>], align 16, addrspace(5)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[ALLOCA]], i8 0, i64 48, i1 false)
-; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr addrspace(5) [[ALLOCA]], align 8
-; CHECK-NEXT:    store i64 [[LOAD]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    store i64 0, ptr [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %alloca = alloca [2 x <3 x ptr>], align 16, addrspace(5)
-  call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false)
+  call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 64, i1 false)
   %load = load i64, ptr addrspace(5) %alloca
   store i64 %load, ptr %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
new file mode 100644
index 0000000000000..a0af3f7442114
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
@@ -0,0 +1,292 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck %s
+
+define amdgpu_kernel void @i32_2d_load_store(ptr %out) {
+; CHECK-LABEL: define amdgpu_kernel void @i32_2d_load_store(
+; CHECK-SAME: ptr [[OUT:%.*]]) {
+; CHECK-NEXT:    store i32 3, ptr [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca [2 x [3 x i32]], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
+  %gep.01 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
+  %gep.02 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
+  %gep.10 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  %gep.11 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
+  %gep.12 = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
+  store i32 0, ptr addrspace(5) %gep.00
+  store i32 1, ptr addrspace(5) %gep.01
+  store i32 2, ptr addrspace(5) %gep.02
+  store i32 3, ptr addrspace(5) %gep.10
+  store i32 4, ptr addrspace(5) %gep.11
+  store i32 5, ptr addrspace(5) %gep.12
+  %gep = getelementptr inbounds [2 x [3 x i32]], ptr addrspace(5) %alloca, i32 0, i32 1
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i64_2d_load_store(ptr %out) {
+; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store(
+; CHECK-SAME: ptr [[OUT:%.*]]) {
+; CHECK-NEXT:    store i64 3, ptr [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
+  %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
+  %gep.02 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
+  %gep.10 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  %gep.11 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
+  %gep.12 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
+  store i64 0, ptr addrspace(5) %gep.00
+  store i64 1, ptr addrspace(5) %gep.01
+  store i64 2, ptr addrspace(5) %gep.02
+  store i64 3, ptr addrspace(5) %gep.10
+  store i64 4, ptr addrspace(5) %gep.11
+  store i64 5, ptr addrspace(5) %gep.12
+  %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1
+  %load = load i64, ptr addrspace(5) %gep
+  store i64 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i32_2d_alloca_store_partial(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
+; CHECK-LABEL: define amdgpu_kernel void @i32_2d_alloca_store_partial(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[DUMMY_LDS:%.*]]) {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 undef, i32 undef>, i32 [[SEL2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[TMP0]] to float
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [2 x [4 x i32]], align 4, addrspace(5)
+  %gep = getelementptr inbounds <4 x i32>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %alloca, align 4
+  %load = load float, ptr addrspace(5) %gep, align 4
+  store float %load, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+
+define amdgpu_kernel void @i64_2d_load_store_cast(ptr %out) {
+; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_cast(
+; CHECK-SAME: ptr [[OUT:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <6 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5>, i32 [[SEL2]]
+; CHECK-NEXT:    store i64 [[TMP1]], ptr [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
+  %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
+  %gep.02 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
+  %gep.10 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  %gep.11 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
+  %gep.12 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
+  store i64 0, ptr addrspace(5) %gep.00
+  store i64 1, ptr addrspace(5) %gep.01
+  store i64 2, ptr addrspace(5) %gep.02
+  store i64 3, ptr addrspace(5) %gep.10
+  store i64 4, ptr addrspace(5) %gep.11
+  store i64 5, ptr addrspace(5) %gep.12
+  %gep = getelementptr inbounds [6 x i64], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i64, ptr addrspace(5) %gep
+  store i64 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i64_2d_load_store_subvec_1(ptr %out) {
+; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_1(
+; CHECK-SAME: ptr [[OUT:%.*]]) {
+; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> <i64 3, i64 4, i64 5>, i32 2
+; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0
+  %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
+  store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
+  %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1
+  %load = load <3 x i64>, ptr addrspace(5) %gep
+  %elem = extractelement <3 x i64> %load, i32 2
+  store i64 %elem, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i64_2d_load_store_subvec_2(ptr %out) {
+; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_2(
+; CHECK-SAME: ptr [[OUT:%.*]]) {
+; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> <i64 3, i64 4, i64 5>, i32 2
+; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca [2 x <3 x i64>], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0
+  %gep.01 = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
+  store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
+  %gep = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0, i32 1
+  %load = load <3 x i64>, ptr addrspace(5) %gep
+  %elem = extractelement <3 x i64> %load, i32 2
+  store i64 %elem, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i64_2d_load_store_subvec_3(ptr %out) {
+; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_3(
+; CHECK-SAME: ptr [[OUT:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[SEL2]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5>, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <6 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5>, i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <3 x i64> [[TMP3]], i64 [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <6 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5>, i32 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <3 x i64> [[TMP6]], i64 [[TMP8]], i64 2
+; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> [[TMP9]], i32 2
+; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [2 x [3 x i64]], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0
+  %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
+  store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
+  %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load <3 x i64>, ptr addrspace(5) %gep
+  %elem = extractelement <3 x i64> %load, i32 2
+  store i64 %elem, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i64_2d_load_store_subvec_4(ptr %out) {
+; CHECK-LABEL: define amdgpu_kernel void @i64_2d_load_store_subvec_4(
+; CHECK-SAME: ptr [[OUT:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[SEL2]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i64> <i64 0, i64 1, i64 2, i64 undef, i64 3, i64 4, i64 5, i64 undef>, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> <i64 0, i64 1, i64 2, i64 undef, i64 3, i64 4, i64 5, i64 undef>, i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <3 x i64> [[TMP3]], i64 [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i64> <i64 0, i64 1, i64 2, i64 undef, i64 3, i64 4, i64 5, i64 undef>, i32 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <3 x i64> [[TMP6]], i64 [[TMP8]], i64 2
+; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> [[TMP9]], i32 2
+; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [2 x <3 x i64>], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0
+  %gep.01 = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
+  store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
+  %gep = getelementptr inbounds [2 x <3 x i64>], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load <3 x i64>, ptr addrspace(5) %gep
+  %elem = extractelement <3 x i64> %load, i32 2
+  store i64 %elem, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i32_3d_load_store(ptr %out) {
+; CHECK-LABEL: define amdgpu_kernel void @i32_3d_load_store(
+; CHECK-SAME: ptr [[OUT:%.*]]) {
+; CHECK-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>, i32 [[SEL2]]
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [2 x [2 x [3 x i32]]], align 16, addrspace(5)
+  %gep.000 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0, i32 0
+  %gep.001 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0, i32 1
+  %gep.002 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0, i32 2
+  %gep.010 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1, i32 0
+  %gep.011 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1, i32 1
+  %gep.012 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1, i32 2
+  %gep.100 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0, i32 0
+  %gep.101 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0, i32 1
+  %gep.102 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0, i32 2
+  %gep.110 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1, i32 0
+  %gep.111 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1, i32 1
+  %gep.112 = getelementptr inbounds [2 x [2 x [3 x i32]]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1, i32 2
+  store i32 0, ptr addrspace(5) %gep.000
+  store i32 1, ptr addrspace(5) %gep.001
+  store i32 2, ptr addrspace(5) %gep.002
+  store i32 3, ptr addrspace(5) %gep.010
+  store i32 4, ptr addrspace(5) %gep.011
+  store i32 5, ptr addrspace(5) %gep.012
+  store i32 6, ptr addrspace(5) %gep.100
+  store i32 7, ptr addrspace(5) %gep.101
+  store i32 8, ptr addrspace(5) %gep.102
+  store i32 9, ptr addrspace(5) %gep.110
+  store i32 10, ptr addrspace(5) %gep.111
+  store i32 11, ptr addrspace(5) %gep.112
+  %gep = getelementptr inbounds [12 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
index 28b923243b6db..ac28d9c20e2fa 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=+promote-alloca < %s | FileCheck -check-prefix=NOOPTS -check-prefix=ALL %s
-; RUN: llc -O1 -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=+promote-alloca < %s | FileCheck -check-prefix=OPTS -check-prefix=ALL %s
+; RUN: llc -O0 -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -disable-promote-alloca-to-vector=1 -mattr=+promote-alloca < %s | FileCheck -check-prefix=NOOPTS -check-prefix=ALL %s
+; RUN: llc -O1 -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -disable-promote-alloca-to-vector=1 -mattr=+promote-alloca < %s | FileCheck -check-prefix=OPTS -check-prefix=ALL %s
 
 ; ALL-LABEL: {{^}}promote_alloca_i32_array_array:
 ; NOOPTS: .amdhsa_group_segment_fixed_size 0
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll
new file mode 100644
index 0000000000000..ee30f761144ce
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll
@@ -0,0 +1,276 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE --check-prefix=DEFAULT %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-vgpr-ratio=2 < %s | FileCheck --check-prefix=BASE %s --check-prefix=RATIO2
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-vgpr-ratio=8 < %s | FileCheck --check-prefix=BASE %s --check-prefix=RATIO8
+
+define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
+; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements(
+; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; DEFAULT-NEXT:    [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5)
+; DEFAULT-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
+; DEFAULT-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; DEFAULT-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
+; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; DEFAULT-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    ret void
+;
+; RATIO2-LABEL: define amdgpu_kernel void @i32_24_elements(
+; RATIO2-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; RATIO2-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; RATIO2-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; RATIO2-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; RATIO2-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; RATIO2-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; RATIO2-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; RATIO2-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; RATIO2-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; RATIO2-NEXT:    ret void
+;
+; RATIO8-LABEL: define amdgpu_kernel void @i32_24_elements(
+; RATIO8-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; RATIO8-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; RATIO8-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; RATIO8-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; RATIO8-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; RATIO8-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; RATIO8-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; RATIO8-NEXT:    [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5)
+; RATIO8-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
+; RATIO8-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; RATIO8-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
+; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; RATIO8-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [24 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false)
+  %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20
+  store i32 42, ptr addrspace(5) %gep.0
+  store i32 43, ptr addrspace(5) %gep.1
+  %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 {
+; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
+; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; DEFAULT-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; DEFAULT-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    ret void
+;
+; RATIO2-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
+; RATIO2-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; RATIO2-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; RATIO2-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; RATIO2-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; RATIO2-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; RATIO2-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; RATIO2-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; RATIO2-NEXT:    [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; RATIO2-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; RATIO2-NEXT:    ret void
+;
+; RATIO8-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
+; RATIO8-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; RATIO8-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; RATIO8-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; RATIO8-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; RATIO8-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; RATIO8-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; RATIO8-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; RATIO8-NEXT:    [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5)
+; RATIO8-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
+; RATIO8-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; RATIO8-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
+; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; RATIO8-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [24 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false)
+  %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20
+  store i32 42, ptr addrspace(5) %gep.0
+  store i32 43, ptr addrspace(5) %gep.1
+  %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i32_16_elements(ptr %out) #0 {
+; DEFAULT-LABEL: define amdgpu_kernel void @i32_16_elements(
+; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] {
+; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; DEFAULT-NEXT:    [[TMP1:%.*]] = extractelement <16 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43>, i32 [[SEL2]]
+; DEFAULT-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    ret void
+;
+; RATIO2-LABEL: define amdgpu_kernel void @i32_16_elements(
+; RATIO2-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] {
+; RATIO2-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; RATIO2-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; RATIO2-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; RATIO2-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; RATIO2-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; RATIO2-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; RATIO2-NEXT:    [[TMP1:%.*]] = extractelement <16 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43>, i32 [[SEL2]]
+; RATIO2-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; RATIO2-NEXT:    ret void
+;
+; RATIO8-LABEL: define amdgpu_kernel void @i32_16_elements(
+; RATIO8-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] {
+; RATIO8-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; RATIO8-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; RATIO8-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; RATIO8-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; RATIO8-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; RATIO8-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; RATIO8-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 16, addrspace(5)
+; RATIO8-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false)
+; RATIO8-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; RATIO8-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15
+; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT:    [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; RATIO8-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [16 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 64, i1 false)
+  %gep.0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 15
+  store i32 42, ptr addrspace(5) %gep.0
+  store i32 43, ptr addrspace(5) %gep.1
+  %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @i32_16_elements_attrib(ptr %out) #2 {
+; DEFAULT-LABEL: define amdgpu_kernel void @i32_16_elements_attrib(
+; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; DEFAULT-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DEFAULT-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; DEFAULT-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; DEFAULT-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; DEFAULT-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; DEFAULT-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; DEFAULT-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 16, addrspace(5)
+; DEFAULT-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false)
+; DEFAULT-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; DEFAULT-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15
+; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; DEFAULT-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; DEFAULT-NEXT:    ret void
+;
+; RATIO2-LABEL: define amdgpu_kernel void @i32_16_elements_attrib(
+; RATIO2-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; RATIO2-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; RATIO2-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; RATIO2-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; RATIO2-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; RATIO2-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; RATIO2-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; RATIO2-NEXT:    [[TMP1:%.*]] = extractelement <16 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43>, i32 [[SEL2]]
+; RATIO2-NEXT:    store i32 [[TMP1]], ptr [[OUT]], align 4
+; RATIO2-NEXT:    ret void
+;
+; RATIO8-LABEL: define amdgpu_kernel void @i32_16_elements_attrib(
+; RATIO8-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; RATIO8-NEXT:    [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; RATIO8-NEXT:    [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; RATIO8-NEXT:    [[C1:%.*]] = icmp uge i32 [[X]], 3
+; RATIO8-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; RATIO8-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; RATIO8-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; RATIO8-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 16, addrspace(5)
+; RATIO8-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false)
+; RATIO8-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; RATIO8-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15
+; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT:    [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
+; RATIO8-NEXT:    ret void
+;
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %alloca = alloca [16 x i32], align 16, addrspace(5)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 64, i1 false)
+  %gep.0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+  %gep.1 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 15
+  store i32 42, ptr addrspace(5) %gep.0
+  store i32 43, ptr addrspace(5) %gep.1
+  %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg)
+
+attributes #0 = { nounwind "amdgpu-promote-alloca-to-vector-max-elements"="24" "amdgpu-waves-per-eu"="4,4" }
+attributes #1 = { nounwind "amdgpu-promote-alloca-to-vector-max-elements"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="2" }
+attributes #2 = { nounwind "amdgpu-promote-alloca-to-vector-max-elements"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="8" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; BASE: {{.*}}

>From 77d30c35e4df7173df5abf5d7511d6b9e54c9a0c Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Fri, 21 Feb 2025 16:09:27 +0900
Subject: [PATCH 2/3] Address reviewer comments: - Move options into pass
 attributes - Change vector size limit from max elements to max 32b registers
 - Add tests for i16, float and ptr in multi-dimensional arrays

---
 llvm/docs/AMDGPUUsage.rst                     |  2 +-
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 40 +++++---
 ...elements.ll => promote-alloca-max-regs.ll} |  8 +-
 .../CodeGen/AMDGPU/promote-alloca-multidim.ll | 99 +++++++++++++++++++
 .../CodeGen/AMDGPU/promote-alloca-subvecs.ll  |  2 +-
 .../AMDGPU/promote-alloca-vgpr-ratio.ll       |  6 +-
 .../CodeGen/AMDGPU/vector-alloca-limits.ll    |  4 +-
 7 files changed, 134 insertions(+), 27 deletions(-)
 rename llvm/test/CodeGen/AMDGPU/{promote-alloca-max-elements.ll => promote-alloca-max-regs.ll} (98%)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index d163d52cc1b31..5b2b191fe43e7 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1720,7 +1720,7 @@ The AMDGPU backend supports the following LLVM IR attributes.
      "amdgpu-sgpr-hazard-mem-wait-cull-threshold"     Sets the number of active SGPR hazards that must be present before
                                                       inserting a cull sequence at a memory wait.
 
-     "amdgpu-promote-alloca-to-vector-max-elements"   Maximum vector size (in elements) to create when promoting alloca.
+     "amdgpu-promote-alloca-to-vector-max-regs"       Maximum vector size (in 32b registers) to create when promoting alloca.
 
      "amdgpu-promote-alloca-to-vector-vgpr-ratio"     Ratio of VGPRs to budget for promoting alloca to vectors.
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index aec00a28c1619..51fee8e02126c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -66,9 +66,10 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
     cl::desc("Maximum byte size to consider promote alloca to vector"),
     cl::init(0));
 
-static cl::opt<unsigned> PromoteAllocaToVectorMaxElements(
-    "amdgpu-promote-alloca-to-vector-max-elements",
-    cl::desc("Maximum vector size (in elements) to use when promoting alloca"),
+static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
+    "amdgpu-promote-alloca-to-vector-max-regs",
+    cl::desc(
+        "Maximum vector size (in 32b registers) to use when promoting alloca"),
     cl::init(16));
 
 // Use up to 1/4 of available register budget for vectorization.
@@ -96,6 +97,8 @@ class AMDGPUPromoteAllocaImpl {
   uint32_t LocalMemLimit = 0;
   uint32_t CurrentLocalMemUsage = 0;
   unsigned MaxVGPRs;
+  unsigned VGPRBudgetRatio;
+  unsigned MaxVectorRegs;
 
   bool IsAMDGCN = false;
   bool IsAMDHSA = false;
@@ -124,6 +127,8 @@ class AMDGPUPromoteAllocaImpl {
 
   void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
 
+  void setFunctionLimits(const Function &F);
+
 public:
   AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
 
@@ -310,6 +315,19 @@ void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
   // clang-format on
 }
 
+void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
+  // Load per function limits, overriding with global options where appropriate.
+  MaxVectorRegs = F.getFnAttributeAsParsedInteger(
+      "amdgpu-promote-alloca-to-vector-max-regs", PromoteAllocaToVectorMaxRegs);
+  if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
+    MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
+  VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
+      "amdgpu-promote-alloca-to-vector-vgpr-ratio",
+      PromoteAllocaToVectorVGPRRatio);
+  if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
+    VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
+}
+
 bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
   Mod = F.getParent();
   DL = &Mod->getDataLayout();
@@ -319,20 +337,14 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
     return false;
 
   MaxVGPRs = getMaxVGPRs(TM, F);
+  setFunctionLimits(F);
 
   bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
 
-  const unsigned VGPRRatio =
-      PromoteAllocaToVectorVGPRRatio.getNumOccurrences()
-          ? PromoteAllocaToVectorVGPRRatio
-          : F.getFnAttributeAsParsedInteger(
-                "amdgpu-promote-alloca-to-vector-vgpr-ratio",
-                PromoteAllocaToVectorVGPRRatio);
-
   unsigned VectorizationBudget =
       (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                   : (MaxVGPRs * 32)) /
-      VGPRRatio;
+      VGPRBudgetRatio;
 
   SmallVector<AllocaInst *, 16> Allocas;
   for (Instruction &I : F.getEntryBlock()) {
@@ -802,11 +814,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   }
 
   const unsigned MaxElements =
-      PromoteAllocaToVectorMaxElements.getNumOccurrences()
-          ? PromoteAllocaToVectorMaxElements
-          : Alloca.getParent()->getParent()->getFnAttributeAsParsedInteger(
-                "amdgpu-promote-alloca-to-vector-max-elements",
-                PromoteAllocaToVectorMaxElements);
+      (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
 
   if (VectorTy->getNumElements() > MaxElements ||
       VectorTy->getNumElements() < 2) {
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-max-elements.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll
similarity index 98%
rename from llvm/test/CodeGen/AMDGPU/promote-alloca-max-elements.ll
rename to llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll
index 7f1cf728bce7e..f9bdfc51f61ff 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-max-elements.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE --check-prefix=DEFAULT %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-elements=24 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX24
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-elements=32 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX32
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=24 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX24
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=32 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX32
 
 define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
 ; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements(
@@ -232,5 +232,5 @@ declare i32 @llvm.amdgcn.workitem.id.y()
 declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg)
 
 attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" }
-attributes #1 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-elements"="24" }
-attributes #2 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-elements"="32" }
+attributes #1 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-regs"="24" }
+attributes #2 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-regs"="32" }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
index a0af3f7442114..fde1b20c211d8 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
@@ -288,5 +288,104 @@ define amdgpu_kernel void @i32_3d_load_store(ptr %out) {
   ret void
 }
 
+define amdgpu_kernel void @i16_2d_load_store(ptr %out, i32 %sel) {
+; CHECK-LABEL: define amdgpu_kernel void @i16_2d_load_store(
+; CHECK-SAME: ptr [[OUT:%.*]], i32 [[SEL:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 3, [[SEL]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5>, i32 [[TMP1]]
+; CHECK-NEXT:    store i16 [[TMP2]], ptr [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca [2 x [3 x i16]], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
+  %gep.01 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
+  %gep.02 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
+  %gep.10 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  %gep.11 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
+  %gep.12 = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
+  store i16 0, ptr addrspace(5) %gep.00
+  store i16 1, ptr addrspace(5) %gep.01
+  store i16 2, ptr addrspace(5) %gep.02
+  store i16 3, ptr addrspace(5) %gep.10
+  store i16 4, ptr addrspace(5) %gep.11
+  store i16 5, ptr addrspace(5) %gep.12
+  %gep = getelementptr inbounds [2 x [3 x i16]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 %sel
+  %load = load i16, ptr addrspace(5) %gep
+  store i16 %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @float_2d_load_store(ptr %out, i32 %sel) {
+; CHECK-LABEL: define amdgpu_kernel void @float_2d_load_store(
+; CHECK-SAME: ptr [[OUT:%.*]], i32 [[SEL:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 3, [[SEL]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00>, i32 [[TMP1]]
+; CHECK-NEXT:    store float [[TMP2]], ptr [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca [2 x [3 x float]], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
+  %gep.01 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
+  %gep.02 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
+  %gep.10 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  %gep.11 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
+  %gep.12 = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
+  store float 0.0, ptr addrspace(5) %gep.00
+  store float 1.0, ptr addrspace(5) %gep.01
+  store float 2.0, ptr addrspace(5) %gep.02
+  store float 3.0, ptr addrspace(5) %gep.10
+  store float 4.0, ptr addrspace(5) %gep.11
+  store float 5.0, ptr addrspace(5) %gep.12
+  %gep = getelementptr inbounds [2 x [3 x float]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 %sel
+  %load = load float, ptr addrspace(5) %gep
+  store float %load, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @ptr_2d_load_store(ptr %out, i32 %sel) {
+; CHECK-LABEL: define amdgpu_kernel void @ptr_2d_load_store(
+; CHECK-SAME: ptr [[OUT:%.*]], i32 [[SEL:%.*]]) {
+; CHECK-NEXT:    [[PTR_0:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 0
+; CHECK-NEXT:    [[PTR_1:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 1
+; CHECK-NEXT:    [[PTR_2:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 2
+; CHECK-NEXT:    [[PTR_3:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 3
+; CHECK-NEXT:    [[PTR_4:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 4
+; CHECK-NEXT:    [[PTR_5:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 5
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <6 x ptr> undef, ptr [[PTR_0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <6 x ptr> [[TMP1]], ptr [[PTR_1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <6 x ptr> [[TMP2]], ptr [[PTR_2]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <6 x ptr> [[TMP3]], ptr [[PTR_3]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <6 x ptr> [[TMP4]], ptr [[PTR_4]], i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <6 x ptr> [[TMP5]], ptr [[PTR_5]], i32 5
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 3, [[SEL]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <6 x ptr> [[TMP6]], i32 [[TMP7]]
+; CHECK-NEXT:    store ptr [[TMP8]], ptr [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca [2 x [3 x ptr]], align 16, addrspace(5)
+  %gep.00 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 0
+  %gep.01 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
+  %gep.02 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 2
+  %gep.10 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
+  %gep.11 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 1
+  %gep.12 = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 2
+  %ptr.0 = getelementptr inbounds ptr, ptr %out, i32 0
+  %ptr.1 = getelementptr inbounds ptr, ptr %out, i32 1
+  %ptr.2 = getelementptr inbounds ptr, ptr %out, i32 2
+  %ptr.3 = getelementptr inbounds ptr, ptr %out, i32 3
+  %ptr.4 = getelementptr inbounds ptr, ptr %out, i32 4
+  %ptr.5 = getelementptr inbounds ptr, ptr %out, i32 5
+  store ptr %ptr.0, ptr addrspace(5) %gep.00
+  store ptr %ptr.1, ptr addrspace(5) %gep.01
+  store ptr %ptr.2, ptr addrspace(5) %gep.02
+  store ptr %ptr.3, ptr addrspace(5) %gep.10
+  store ptr %ptr.4, ptr addrspace(5) %gep.11
+  store ptr %ptr.5, ptr addrspace(5) %gep.12
+  %gep = getelementptr inbounds [2 x [3 x ptr]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 %sel
+  %load = load ptr, ptr addrspace(5) %gep
+  store ptr %load, ptr %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.workitem.id.y()
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
index 7c5410004ed5b..e5d0c563b74c4 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -amdgpu-promote-alloca-to-vector-max-regs=32 < %s | FileCheck %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll
index ee30f761144ce..e06b491f5986c 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll
@@ -269,8 +269,8 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.workitem.id.y()
 declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg)
 
-attributes #0 = { nounwind "amdgpu-promote-alloca-to-vector-max-elements"="24" "amdgpu-waves-per-eu"="4,4" }
-attributes #1 = { nounwind "amdgpu-promote-alloca-to-vector-max-elements"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="2" }
-attributes #2 = { nounwind "amdgpu-promote-alloca-to-vector-max-elements"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="8" }
+attributes #0 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" }
+attributes #1 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="2" }
+attributes #2 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="8" }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; BASE: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
index 64d7c7868ca8d..fee045359e751 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -amdgpu-promote-alloca-to-vector-limit=32 < %s | FileCheck -check-prefix=LIMIT32 %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -amdgpu-promote-alloca-to-vector-max-regs=64 < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -amdgpu-promote-alloca-to-vector-limit=32 -amdgpu-promote-alloca-to-vector-max-regs=64 < %s | FileCheck -check-prefix=LIMIT32 %s
 
 target datalayout = "A5"
 

>From a7d512378cd22c839c4ee9e14e0f1d056b9a097d Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Tue, 11 Mar 2025 15:36:44 +0900
Subject: [PATCH 3/3] - Hide generated undefs in lit test to avoid deprecation
 warning

---
 llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
index fde1b20c211d8..5198597c935fd 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
@@ -61,7 +61,7 @@ define amdgpu_kernel void @i32_2d_alloca_store_partial(ptr addrspace(1) %out, pt
 ; CHECK-NEXT:    [[C2:%.*]] = icmp uge i32 [[Y]], 3
 ; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
 ; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 undef, i32 undef>, i32 [[SEL2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}>, i32 [[SEL2]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[TMP0]] to float
 ; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    ret void
@@ -208,13 +208,13 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_4(ptr %out) {
 ; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
 ; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[SEL2]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i64> <i64 0, i64 1, i64 2, i64 undef, i64 3, i64 4, i64 5, i64 undef>, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i64> <i64 0, i64 1, i64 2, i64 {{.*}}, i64 3, i64 4, i64 5, i64 {{.*}}>, i32 [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i64 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> <i64 0, i64 1, i64 2, i64 undef, i64 3, i64 4, i64 5, i64 undef>, i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> <i64 0, i64 1, i64 2, i64 {{.*}}, i64 3, i64 4, i64 5, i64 {{.*}}>, i32 [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <3 x i64> [[TMP3]], i64 [[TMP5]], i64 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i64> <i64 0, i64 1, i64 2, i64 undef, i64 3, i64 4, i64 5, i64 undef>, i32 [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i64> <i64 0, i64 1, i64 2, i64 {{.*}}, i64 3, i64 4, i64 5, i64 {{.*}}>, i32 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <3 x i64> [[TMP6]], i64 [[TMP8]], i64 2
 ; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> [[TMP9]], i32 2
 ; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
@@ -351,7 +351,7 @@ define amdgpu_kernel void @ptr_2d_load_store(ptr %out, i32 %sel) {
 ; CHECK-NEXT:    [[PTR_3:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 3
 ; CHECK-NEXT:    [[PTR_4:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 4
 ; CHECK-NEXT:    [[PTR_5:%.*]] = getelementptr inbounds ptr, ptr [[OUT]], i32 5
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <6 x ptr> undef, ptr [[PTR_0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <6 x ptr> {{.*}}, ptr [[PTR_0]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <6 x ptr> [[TMP1]], ptr [[PTR_1]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <6 x ptr> [[TMP2]], ptr [[PTR_2]], i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <6 x ptr> [[TMP3]], ptr [[PTR_3]], i32 3



More information about the llvm-commits mailing list