[llvm] r269708 - AMDGPU: Fix promote alloca pass creating huge arrays

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Mon May 16 14:20:00 PDT 2016


Author: arsenm
Date: Mon May 16 16:19:59 2016
New Revision: 269708

URL: http://llvm.org/viewvc/llvm-project?rev=269708&view=rev
Log:
AMDGPU: Fix promote alloca pass creating huge arrays

Previously this assumed it could use all of local memory, which is
a bad decision because it restricts occupancy.

By default, only try to use as much space as would reduce occupancy
to 7 waves, an arbitrarily chosen limit.

Based on the existing LDS usage, try to round up to the limit of the
current occupancy tier instead of hurting occupancy further. This
isn't ideal, because the pass doesn't accurately know how much space
is going to be used for alignment padding.
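
To make the limit selection easier to follow, here is a minimal standalone
C++ sketch of the new logic. It is not part of the patch: the wave/LDS tier
table is copied from the AMDGPUSubtarget.cpp hunk below, and the 1060-byte
usage figure comes from the added padding-size-estimate test.

  #include <algorithm>
  #include <cstdint>
  #include <cstdio>

  // LDS bytes usable per workgroup without dropping below NWaves waves (SI).
  static unsigned maxLocalMemSizeWithWaveCount(unsigned NWaves) {
    switch (NWaves) {
    case 10: return 1638;
    case 9:  return 1820;
    case 8:  return 2048;
    case 7:  return 2340;
    case 6:  return 2730;
    case 5:  return 3276;
    case 4:  return 4096;
    case 3:  return 5461;
    case 2:  return 8192;
    default: return 32768; // 1 wave: the full LDS size (32K assumed for SI)
    }
  }

  // Inverse: the best occupancy if the given LDS size were the only limit.
  static unsigned occupancyWithLocalMemSize(uint32_t Bytes) {
    for (unsigned Waves = 10; Waves >= 2; --Waves)
      if (Bytes <= maxLocalMemSizeWithWaveCount(Waves))
        return Waves;
    return 1;
  }

  int main() {
    uint32_t CurrentLocalMemUsage = 1060; // existing LDS globals (see new test)
    unsigned OccupancyHint = 7;           // default when no attribute is set
    unsigned MaxOccupancy =
        std::min(OccupancyHint, occupancyWithLocalMemSize(CurrentLocalMemUsage));
    unsigned LocalMemLimit = maxLocalMemSizeWithWaveCount(MaxOccupancy);
    printf("limit %u, %u bytes left for promotion\n", LocalMemLimit,
           LocalMemLimit - CurrentLocalMemUsage); // limit 2340, 1280 left
    return 0;
  }

Running it prints a limit of 2340 bytes with 1280 bytes left for promotable
allocas, matching the numbers in the new test.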

Added:
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
    llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
    llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
    llvm/trunk/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
    llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
    llvm/trunk/test/CodeGen/AMDGPU/indirect-private-64.ll
    llvm/trunk/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
    llvm/trunk/test/CodeGen/AMDGPU/private-memory-r600.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-globals.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp?rev=269708&r1=269707&r2=269708&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp Mon May 16 16:19:59 2016
@@ -36,7 +36,8 @@ private:
   MDNode *MaxWorkGroupSizeRange;
 
   // FIXME: This should be per-kernel.
-  int LocalMemAvailable;
+  uint32_t LocalMemLimit;
+  uint32_t CurrentLocalMemUsage;
 
   bool IsAMDGCN;
   bool IsAMDHSA;
@@ -67,7 +68,8 @@ public:
     Mod(nullptr),
     DL(nullptr),
     MaxWorkGroupSizeRange(nullptr),
-    LocalMemAvailable(0),
+    LocalMemLimit(0),
+    CurrentLocalMemUsage(0),
     IsAMDGCN(false),
     IsAMDHSA(false) { }
 
@@ -130,37 +132,86 @@ bool AMDGPUPromoteAlloca::runOnFunction(
   for (Type *ParamTy : FTy->params()) {
     PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
     if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
-      LocalMemAvailable = 0;
-      DEBUG(dbgs() << "Function has local memory argument.  Promoting to "
+      LocalMemLimit = 0;
+      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                       "local memory disabled.\n");
       return false;
     }
   }
 
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
-  LocalMemAvailable = ST.getLocalMemorySize();
-  if (LocalMemAvailable == 0)
+
+  LocalMemLimit = ST.getLocalMemorySize();
+  if (LocalMemLimit == 0)
     return false;
 
+  const DataLayout &DL = Mod->getDataLayout();
+
   // Check how much local memory is being used by global objects
+  CurrentLocalMemUsage = 0;
   for (GlobalVariable &GV : Mod->globals()) {
     if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
       continue;
 
-    for (User *U : GV.users()) {
-      Instruction *Use = dyn_cast<Instruction>(U);
+    for (const User *U : GV.users()) {
+      const Instruction *Use = dyn_cast<Instruction>(U);
       if (!Use)
         continue;
 
       if (Use->getParent()->getParent() == &F) {
-        LocalMemAvailable -= DL->getTypeAllocSize(GV.getValueType());
+        unsigned Align = GV.getAlignment();
+        if (Align == 0)
+          Align = DL.getABITypeAlignment(GV.getValueType());
+
+        // FIXME: Try to account for padding here. The padding is currently
+        // determined from the inverse order of uses in the function. I'm not
+        // sure if the use list order is in any way connected to this, so the
+        // total reported size is likely incorrect.
+        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
+        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+        CurrentLocalMemUsage += AllocSize;
         break;
       }
     }
   }
 
-  LocalMemAvailable = std::max(0, LocalMemAvailable);
-  DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");
+  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
+
+  // Restrict local memory usage so that we don't drastically reduce occupancy,
+  // unless it is already significantly reduced.
+
+  // TODO: Have some sort of hint or other heuristics to guess occupancy based
+  // on other factors.
+  unsigned OccupancyHint
+    = AMDGPU::getIntegerAttribute(F, "amdgpu-max-waves-per-eu", 0);
+  if (OccupancyHint == 0)
+    OccupancyHint = 7;
+
+  // Clamp to max value.
+  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerCU());
+
+  // Check the hint but ignore it if it's obviously wrong from the existing LDS
+  // usage.
+  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
+
+
+  // Round up to the next tier of usage.
+  unsigned MaxSizeWithWaveCount
+    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
+
+  // Program is possibly broken by using more local mem than available.
+  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
+    return false;
+
+  LocalMemLimit = MaxSizeWithWaveCount;
+
+  DEBUG(
+    dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
+    << "  Rounding size to " << MaxSizeWithWaveCount
+    << " with a maximum occupancy of " << MaxOccupancy << '\n'
+    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
+    << " available for promotion\n"
+  );
 
   BasicBlock &EntryBB = *F.begin();
   for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
@@ -565,6 +616,7 @@ bool AMDGPUPromoteAlloca::collectUsesWit
   return true;
 }
 
+// FIXME: Should try to pick the most likely to be profitable allocas first.
 void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
   // Array allocations are probably not worth handling, since an allocation of
   // the array type is the canonical form.
@@ -578,10 +630,10 @@ void AMDGPUPromoteAlloca::handleAlloca(A
 
   DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
-  if (tryPromoteAllocaToVector(&I))
+  if (tryPromoteAllocaToVector(&I)) {
+    DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
     return;
-
-  DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
+  }
 
   const Function &ContainingFunction = *I.getParent()->getParent();
 
@@ -589,14 +641,30 @@ void AMDGPUPromoteAlloca::handleAlloca(A
   // function attribute if it is available.
   unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);
 
-  int AllocaSize =
-      WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
+  const DataLayout &DL = Mod->getDataLayout();
 
-  if (AllocaSize > LocalMemAvailable) {
-    DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
+  unsigned Align = I.getAlignment();
+  if (Align == 0)
+    Align = DL.getABITypeAlignment(I.getAllocatedType());
+
+  // FIXME: This computed padding is likely wrong since it depends on inverse
+  // usage order.
+  //
+  // FIXME: It is also possible that if we're allowed to use all of the memory
+  // we could end up using more than the maximum due to alignment padding.
+
+  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
+  uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
+  NewSize += AllocSize;
+
+  if (NewSize > LocalMemLimit) {
+    DEBUG(dbgs() << "  " << AllocSize
+          << " bytes of local memory not available to promote\n");
     return;
   }
 
+  CurrentLocalMemUsage = NewSize;
+
   std::vector<Value*> WorkList;
 
   if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
@@ -605,7 +673,6 @@ void AMDGPUPromoteAlloca::handleAlloca(A
   }
 
   DEBUG(dbgs() << "Promoting alloca to local memory\n");
-  LocalMemAvailable -= AllocaSize;
 
   Function *F = I.getParent()->getParent();
 

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp?rev=269708&r1=269707&r2=269708&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp Mon May 16 16:19:59 2016
@@ -154,6 +154,64 @@ unsigned AMDGPUSubtarget::getStackEntryS
   }
 }
 
+// FIXME: These limits are for SI. Did they change with the larger maximum LDS
+// size?
+unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
+  switch (NWaves) {
+  case 10:
+    return 1638;
+  case 9:
+    return 1820;
+  case 8:
+    return 2048;
+  case 7:
+    return 2340;
+  case 6:
+    return 2730;
+  case 5:
+    return 3276;
+  case 4:
+    return 4096;
+  case 3:
+    return 5461;
+  case 2:
+    return 8192;
+  default:
+    return getLocalMemorySize();
+  }
+}
+
+unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
+  if (Bytes <= 1638)
+    return 10;
+
+  if (Bytes <= 1820)
+    return 9;
+
+  if (Bytes <= 2048)
+    return 8;
+
+  if (Bytes <= 2340)
+    return 7;
+
+  if (Bytes <= 2730)
+    return 6;
+
+  if (Bytes <= 3276)
+    return 5;
+
+  if (Bytes <= 4096)
+    return 4;
+
+  if (Bytes <= 5461)
+    return 3;
+
+  if (Bytes <= 8192)
+    return 2;
+
+  return 1;
+}
+
 unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const {
   switch(getGeneration()) {
   default: llvm_unreachable("ChipID unknown");

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h?rev=269708&r1=269707&r2=269708&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h Mon May 16 16:19:59 2016
@@ -269,6 +269,15 @@ public:
     return CFALUBug;
   }
 
+  /// Return the amount of LDS that can be used that will not restrict the
+  /// occupancy lower than WaveCount.
+  unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
+
+  /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
+  /// the given LDS memory size is the only constraint.
+  unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
+
+
   int getLocalMemorySize() const {
     return LocalMemorySize;
   }
@@ -334,7 +343,7 @@ public:
       return 10;
 
     // FIXME: Not sure what this is for other subtagets.
-    llvm_unreachable("do not know max waves per CU for this subtarget.");
+    return 8;
   }
 
   bool enableSubRegLiveness() const override {

Modified: llvm/trunk/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/amdgpu.private-memory.ll?rev=269708&r1=269707&r2=269708&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/amdgpu.private-memory.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/amdgpu.private-memory.ll Mon May 16 16:19:59 2016
@@ -78,7 +78,7 @@
 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0
 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0
 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0
-define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -100,7 +100,7 @@ entry:
 
 ; OPT-LABEL: @high_alignment(
 ; OPT: getelementptr inbounds [256 x [8 x i32]], [256 x [8 x i32]] addrspace(3)* @high_alignment.stack, i32 0, i32 %{{[0-9]+}}
-define void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+define void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [8 x i32], align 16
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -125,7 +125,7 @@ entry:
 ; OPT: alloca [5 x i32]
 
 ; SI-NOT: ds_write
-define void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+define void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -160,7 +160,7 @@ entry:
 ; SI-NOT: v_movrel
 %struct.point = type { i32, i32 }
 
-define void @multiple_structs(i32 addrspace(1)* %out) {
+define void @multiple_structs(i32 addrspace(1)* %out) #0 {
 entry:
   %a = alloca %struct.point
   %b = alloca %struct.point
@@ -189,7 +189,7 @@ entry:
 ; R600-NOT: MOVA_INT
 ; SI-NOT: v_movrel
 
-define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 entry:
   %prv_array_const = alloca [2 x i32]
   %prv_array = alloca [2 x i32]
@@ -228,7 +228,7 @@ for.end:
 ; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0
 ; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:2 ; encoding: [0x02,0x10,0x68,0xe0
 ; SI-PROMOTE: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
-define void @short_array(i32 addrspace(1)* %out, i32 %index) {
+define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i16]
   %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
@@ -248,7 +248,7 @@ entry:
 
 ; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0
 ; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:1 ; encoding: [0x01,0x10,0x60,0xe0
-define void @char_array(i32 addrspace(1)* %out, i32 %index) {
+define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i8]
   %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
@@ -263,7 +263,6 @@ entry:
 
 }
 
-
 ; Test that two stack objects are not stored in the same register
 ; The second stack object should be in T3.X
 ; FUNC-LABEL: {{^}}no_overlap:
@@ -271,7 +270,7 @@ entry:
 ; R600_CHECK: [[CHAN:[XYZW]]]+
 ; R600-NOT: [[CHAN]]+
 ; SI: v_mov_b32_e32 v3
-define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
+define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %0 = alloca [3 x i8], align 1
   %1 = alloca [2 x i8], align 1
@@ -295,7 +294,7 @@ entry:
   ret void
 }
 
-define void @char_array_array(i32 addrspace(1)* %out, i32 %index) {
+define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i8]]
   %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
@@ -309,7 +308,7 @@ entry:
   ret void
 }
 
-define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) {
+define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i32]]
   %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
@@ -322,7 +321,7 @@ entry:
   ret void
 }
 
-define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) {
+define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i64]]
   %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
@@ -337,7 +336,7 @@ entry:
 
 %struct.pair32 = type { i32, i32 }
 
-define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) {
+define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x %struct.pair32]]
   %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
@@ -350,7 +349,7 @@ entry:
   ret void
 }
 
-define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) {
+define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x %struct.pair32]
   %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
@@ -384,7 +383,7 @@ entry:
 ; SI-NOT: ds_write
 ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
 ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 ;
-define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32]
   %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   store i32 5, i32* %tmp0
@@ -401,3 +400,5 @@ define void @ptrtoint(i32 addrspace(1)*
 ; HSAOPT: !1 = !{i32 0, i32 2048}
 
 ; NOHSAOPT: !0 = !{i32 0, i32 2048}
+
+attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="2" }

Modified: llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll?rev=269708&r1=269707&r2=269708&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll Mon May 16 16:19:59 2016
@@ -47,6 +47,6 @@ define void @test_private_array_ptr_calc
   ret void
 }
 
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="1" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind convergent }

Modified: llvm/trunk/test/CodeGen/AMDGPU/indirect-private-64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/indirect-private-64.ll?rev=269708&r1=269707&r2=269708&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/indirect-private-64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/indirect-private-64.ll Mon May 16 16:19:59 2016
@@ -4,7 +4,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA16 -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
 
-declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.s.barrier() #0
 
 ; SI-LABEL: {{^}}private_access_f64_alloca:
 
@@ -112,5 +112,5 @@ define void @private_access_v2i64_alloca
   ret void
 }
 
-attributes #0 = { nounwind }
-attributes #1 = { nounwind convergent }
+attributes #0 = { convergent nounwind }
+attributes #1 = { nounwind "amdgpu-max-waves-per-eu"="2" "amdgpu-max-work-group-size"="64" }

Modified: llvm/trunk/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll?rev=269708&r1=269707&r2=269708&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll Mon May 16 16:19:59 2016
@@ -66,7 +66,52 @@ entry:
   ret void
 }
 
-attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
-attributes #1 = { nounwind "amdgpu-max-work-group-size"="256" }
-attributes #2 = { nounwind "amdgpu-max-work-group-size"="1600" }
+; CHECK: @occupancy_0(
+; CHECK: alloca [5 x i32]
+define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %2 = load i32, i32* %arrayidx10, align 4
+  store i32 %2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %3, i32 addrspace(1)* %arrayidx13
+  ret void
+}
 
+; CHECK: @occupancy_max(
+; CHECK: alloca [5 x i32]
+define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %2 = load i32, i32* %arrayidx10, align 4
+  store i32 %2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %3, i32 addrspace(1)* %arrayidx13
+  ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
+attributes #1 = { nounwind "amdgpu-max-waves-per-eu"="3" "amdgpu-max-work-group-size"="256" }
+attributes #2 = { nounwind "amdgpu-max-waves-per-eu"="1" "amdgpu-max-work-group-size"="1600" }
+attributes #3 = { nounwind "amdgpu-max-waves-per-eu"="0" }
+attributes #4 = { nounwind "amdgpu-max-waves-per-eu"="-1" }

Modified: llvm/trunk/test/CodeGen/AMDGPU/private-memory-r600.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/private-memory-r600.ll?rev=269708&r1=269707&r2=269708&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/private-memory-r600.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/private-memory-r600.ll Mon May 16 16:19:59 2016
@@ -16,7 +16,7 @@ declare i32 @llvm.r600.read.tidig.x() no
 ; OPT: call i32 @llvm.r600.read.tidig.y(), !range !0
 ; OPT: call i32 @llvm.r600.read.tidig.z(), !range !0
 
-define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -47,7 +47,7 @@ entry:
 ; R600-NOT: MOVA_INT
 %struct.point = type { i32, i32 }
 
-define void @multiple_structs(i32 addrspace(1)* %out) {
+define void @multiple_structs(i32 addrspace(1)* %out) #0 {
 entry:
   %a = alloca %struct.point
   %b = alloca %struct.point
@@ -75,7 +75,7 @@ entry:
 ; FUNC-LABEL: {{^}}direct_loop:
 ; R600-NOT: MOVA_INT
 
-define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 entry:
   %prv_array_const = alloca [2 x i32]
   %prv_array = alloca [2 x i32]
@@ -110,7 +110,7 @@ for.end:
 ; FUNC-LABEL: {{^}}short_array:
 
 ; R600: MOVA_INT
-define void @short_array(i32 addrspace(1)* %out, i32 %index) {
+define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i16]
   %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
@@ -127,7 +127,7 @@ entry:
 ; FUNC-LABEL: {{^}}char_array:
 
 ; R600: MOVA_INT
-define void @char_array(i32 addrspace(1)* %out, i32 %index) {
+define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i8]
   %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
@@ -148,7 +148,7 @@ entry:
 ; R600-NOT: MOV T0.X
 ; Additional check in case the move ends up in the last slot
 ; R600-NOT: MOV * TO.X
-define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
+define void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %0 = alloca [2 x i32]
   %1 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 0
@@ -169,7 +169,7 @@ entry:
 ; R600_CHECK: MOV
 ; R600_CHECK: [[CHAN:[XYZW]]]+
 ; R600-NOT: [[CHAN]]+
-define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
+define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %0 = alloca [3 x i8], align 1
   %1 = alloca [2 x i8], align 1
@@ -193,7 +193,7 @@ entry:
   ret void
 }
 
-define void @char_array_array(i32 addrspace(1)* %out, i32 %index) {
+define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i8]]
   %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
@@ -207,7 +207,7 @@ entry:
   ret void
 }
 
-define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) {
+define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i32]]
   %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
@@ -220,7 +220,7 @@ entry:
   ret void
 }
 
-define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) {
+define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i64]]
   %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
@@ -235,7 +235,7 @@ entry:
 
 %struct.pair32 = type { i32, i32 }
 
-define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) {
+define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x %struct.pair32]]
   %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
@@ -248,7 +248,7 @@ entry:
   ret void
 }
 
-define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) {
+define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x %struct.pair32]
   %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
@@ -282,7 +282,7 @@ entry:
 ; SI-NOT: ds_write
 ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
 ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
-define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32]
   %tmp0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   store i32 5, i32* %tmp0
@@ -296,3 +296,5 @@ define void @ptrtoint(i32 addrspace(1)*
 }
 
 ; OPT: !0 = !{i32 0, i32 2048}
+
+attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="2" }

Modified: llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-globals.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-globals.ll?rev=269708&r1=269707&r2=269708&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-globals.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-globals.ll Mon May 16 16:19:59 2016
@@ -2,7 +2,8 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=ASM %s
 
 
- at global_array= internal unnamed_addr addrspace(3) global [1500 x [10 x i32]] undef, align 4
+ at global_array0 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
+ at global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
 
 ; IR-LABEL: define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 ; IR: alloca [10 x i32]
@@ -26,7 +27,9 @@ entry:
   %tmp3 = load i32, i32* %arrayidx12
   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
   store i32 %tmp3, i32 addrspace(1)* %arrayidx13
-  %v = getelementptr inbounds [1500 x [10 x i32]], [1500 x [10 x i32]] addrspace(3)* @global_array, i32 0, i32 0, i32 0
-  store i32 %tmp3, i32 addrspace(3)* %v
+  %v0 = getelementptr inbounds [750 x [10 x i32]], [750 x [10 x i32]] addrspace(3)* @global_array0, i32 0, i32 0, i32 0
+  store i32 %tmp3, i32 addrspace(3)* %v0
+  %v1 = getelementptr inbounds [750 x [10 x i32]], [750 x [10 x i32]] addrspace(3)* @global_array1, i32 0, i32 0, i32 0
+  store i32 %tmp3, i32 addrspace(3)* %v1
   ret void
 }

Modified: llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll?rev=269708&r1=269707&r2=269708&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll Mon May 16 16:19:59 2016
@@ -11,11 +11,11 @@ declare void @llvm.memset.p0i8.i32(i8* n
 declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) #1
 
 ; CHECK-LABEL: @promote_with_memcpy(
-; CHECK: getelementptr inbounds [256 x [17 x i32]], [256 x [17 x i32]] addrspace(3)* @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
 ; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false)
 define void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %alloca = alloca [17 x i32], align 16
+  %alloca = alloca [17 x i32], align 4
   %alloca.bc = bitcast [17 x i32]* %alloca to i8*
   %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
   %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
@@ -25,11 +25,11 @@ define void @promote_with_memcpy(i32 add
 }
 
 ; CHECK-LABEL: @promote_with_memmove(
-; CHECK: getelementptr inbounds [256 x [17 x i32]], [256 x [17 x i32]] addrspace(3)* @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: call void @llvm.memmove.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
 ; CHECK: call void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false)
 define void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %alloca = alloca [17 x i32], align 16
+  %alloca = alloca [17 x i32], align 4
   %alloca.bc = bitcast [17 x i32]* %alloca to i8*
   %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
   %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
@@ -39,10 +39,10 @@ define void @promote_with_memmove(i32 ad
 }
 
 ; CHECK-LABEL: @promote_with_memset(
-; CHECK: getelementptr inbounds [256 x [17 x i32]], [256 x [17 x i32]] addrspace(3)* @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: call void @llvm.memset.p3i8.i32(i8 addrspace(3)* %alloca.bc, i8 7, i32 68, i32 4, i1 false)
 define void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %alloca = alloca [17 x i32], align 16
+  %alloca = alloca [17 x i32], align 4
   %alloca.bc = bitcast [17 x i32]* %alloca to i8*
   %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
   %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
@@ -51,15 +51,15 @@ define void @promote_with_memset(i32 add
 }
 
 ; CHECK-LABEL: @promote_with_objectsize(
-; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [256 x [17 x i32]], [256 x [17 x i32]] addrspace(3)* @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false)
 define void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {
-  %alloca = alloca [17 x i32], align 16
+  %alloca = alloca [17 x i32], align 4
   %alloca.bc = bitcast [17 x i32]* %alloca to i8*
   %size = call i32 @llvm.objectsize.i32.p0i8(i8* %alloca.bc, i1 false)
   store i32 %size, i32 addrspace(1)* %out
   ret void
 }
 
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" "amdgpu-max-waves-per-eu"="3" }
 attributes #1 = { nounwind readnone }

Modified: llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll?rev=269708&r1=269707&r2=269708&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll Mon May 16 16:19:59 2016
@@ -34,5 +34,5 @@ entry:
   ret void
 }
 
-attributes #0 = { nounwind }
-attributes #1 = { nounwind optnone noinline }
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }
+attributes #1 = { nounwind optnone noinline "amdgpu-max-work-group-size"="64" }

Added: llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll?rev=269708&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll Mon May 16 16:19:59 2016
@@ -0,0 +1,130 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
+
+; This shows that the LDS usage estimate is sensitive to the order
+; of the LDS globals.
+
+; Both of these functions use the same amount of LDS, but the total
+; changes depending on the visit order of first use.
+
+; The one with the suboptimal order resulting in extra padding exceeds
+; the desired limit
+
+; The padding estimate heuristic used by the promote alloca pass
+; is mostly determined by the order of the globals,
+
+; Raw usage = 1060 bytes
+; Rounded usage:
+; 292 + (4 pad) + 256 + (8 pad) + 512 = 1072
+; 512 + (0 pad) + 256 + (0 pad) + 292 = 1060
+
+; At default occupancy guess of 7, 2340 bytes available total.
+
+; 1280 need to be left to promote alloca
+; optimally packed, this requires
+
+
+ at lds0 = internal unnamed_addr addrspace(3) global [32 x <4 x i32>] undef, align 16
+ at lds2 = internal unnamed_addr addrspace(3) global [32 x i64] undef, align 8
+ at lds1 = internal unnamed_addr addrspace(3) global [73 x i32] undef, align 4
+
+
+; GCN-LABEL: {{^}}promote_alloca_size_order_0:
+; GCN: workgroup_group_segment_byte_size = 2340
+define void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %tmp0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %tmp2 = load i32, i32* %arrayidx10, align 4
+  store i32 %tmp2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %tmp3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %tmp3, i32 addrspace(1)* %arrayidx13
+
+  %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx
+  store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4
+
+  %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx
+  store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8
+
+  %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx
+  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}promote_alloca_size_order_1:
+; GCN: workgroup_group_segment_byte_size = 2352
+define void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %tmp0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %tmp2 = load i32, i32* %arrayidx10, align 4
+  store i32 %tmp2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %tmp3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %tmp3, i32 addrspace(1)* %arrayidx13
+
+  %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx
+  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16
+
+  %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx
+  store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8
+
+  %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx
+  store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4
+
+  ret void
+}
+
+ at lds3 = internal unnamed_addr addrspace(3) global [13 x i32] undef, align 4
+ at lds4 = internal unnamed_addr addrspace(3) global [63 x <4 x i32>] undef, align 16
+
+; The guess from the alignment padding pushes this over the determined
+; size limit, so it isn't promoted
+
+; GCN-LABEL: {{^}}promote_alloca_align_pad_guess_over_limit:
+; GCN: workgroup_group_segment_byte_size = 1060
+define void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %tmp0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %tmp2 = load i32, i32* %arrayidx10, align 4
+  store i32 %tmp2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %tmp3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %tmp3, i32 addrspace(1)* %arrayidx13
+
+  %gep.lds3 = getelementptr inbounds [13 x i32], [13 x i32] addrspace(3)* @lds3, i32 0, i32 %idx
+  store volatile i32 0, i32 addrspace(3)* %gep.lds3, align 4
+
+  %gep.lds4 = getelementptr inbounds [63 x <4 x i32>], [63 x <4 x i32>] addrspace(3)* @lds4, i32 0, i32 %idx
+  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds4, align 16
+
+  ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }

Modified: llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll?rev=269708&r1=269707&r2=269708&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll Mon May 16 16:19:59 2016
@@ -35,4 +35,4 @@ define void @lds_promoted_alloca_icmp_un
 
 declare i32* @get_unknown_pointer() #0
 
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="1" }

Modified: llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll?rev=269708&r1=269707&r2=269708&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll Mon May 16 16:19:59 2016
@@ -167,4 +167,4 @@ for.body:
 
 declare i32* @get_unknown_pointer() #0
 
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="1" }

Modified: llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll?rev=269708&r1=269707&r2=269708&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll Mon May 16 16:19:59 2016
@@ -99,4 +99,4 @@ bb2:
   ret void
 }
 
-attributes #0 = { norecurse nounwind }
+attributes #0 = { norecurse nounwind "amdgpu-max-waves-per-eu"="1" }



