[llvm] 5e007fe - AMDGPU: Support non-entry block static sized allocas

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed May 27 16:00:13 PDT 2020


Author: Matt Arsenault
Date: 2020-05-27T18:46:10-04:00
New Revision: 5e007fe9980cc44e9c4a14c9baf3bdfb012d2c18

URL: https://github.com/llvm/llvm-project/commit/5e007fe9980cc44e9c4a14c9baf3bdfb012d2c18
DIFF: https://github.com/llvm/llvm-project/commit/5e007fe9980cc44e9c4a14c9baf3bdfb012d2c18.diff

LOG: AMDGPU: Support non-entry block static sized allocas

OpenMP emits these for some reason, so handle them. Assume these use
4096 bytes by default, with a flag to override this. Also change the
related stack assumption for calls to have a flag.
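
Both assumptions are exposed as hidden llc options, -amdgpu-assume-dynamic-stack-object-size
(default 4096) and -amdgpu-assume-external-call-stack-size (default 16384). A minimal
invocation sketch, with illustrative override values and a placeholder input file:

    llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 \
      -amdgpu-assume-dynamic-stack-object-size=1024 \
      -amdgpu-assume-external-call-stack-size=8192 \
      input.ll -o -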

Added: 
    llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.h

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 81676d63643d..fe0462a31064 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -49,6 +49,22 @@ using namespace llvm;
 using namespace llvm::AMDGPU;
 using namespace llvm::AMDGPU::HSAMD;
 
+// We need to tell the runtime some amount ahead of time if we don't know the
+// true stack size. Assume a smaller number if this is only due to dynamic /
+// non-entry block allocas.
+static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
+  "amdgpu-assume-external-call-stack-size",
+  cl::desc("Assumed stack use of any external call (in bytes)"),
+  cl::Hidden,
+  cl::init(16384));
+
+static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
+  "amdgpu-assume-dynamic-stack-object-size",
+  cl::desc("Assumed extra stack use if there are any "
+           "variable sized objects (in bytes)"),
+  cl::Hidden,
+  cl::init(4096));
+
 // This should get the default rounding mode from the kernel. We just set the
 // default here, but this could change if the OpenCL rounding mode pragmas are
 // used.
@@ -637,8 +653,13 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
     Info.UsesFlatScratch = false;
   }
 
-  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
   Info.PrivateSegmentSize = FrameInfo.getStackSize();
+
+  // Assume a big number if there are any unknown sized objects.
+  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
+  if (Info.HasDynamicallySizedStack)
+    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
+
   if (MFI->isStackRealigned())
     Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
 
@@ -907,7 +928,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
           MaxVGPR = std::max(MaxVGPR, 23);
           MaxAGPR = std::max(MaxAGPR, 23);
 
-          CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384));
+          CalleeFrameSize = std::max(CalleeFrameSize,
+            static_cast<uint64_t>(AssumedStackSizeForExternalCall));
+
           Info.UsesVCC = true;
           Info.UsesFlatScratch = ST.hasFlatAddressSpace();
           Info.HasDynamicallySizedStack = true;

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 689aece39dee..042087ec5a4d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3089,6 +3089,67 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                          IsThisReturn ? OutVals[0] : SDValue());
 }
 
+// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
+// except for applying the wave size scale to the increment amount.
+SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
+    SDValue Op, SelectionDAG &DAG) const {
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  SDValue Tmp1 = Op;
+  SDValue Tmp2 = Op.getValue(1);
+  SDValue Tmp3 = Op.getOperand(2);
+  SDValue Chain = Tmp1.getOperand(0);
+
+  Register SPReg = Info->getStackPtrOffsetReg();
+
+  // Chain the dynamic stack allocation so that it doesn't modify the stack
+  // pointer when other instructions are using the stack.
+  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
+
+  SDValue Size  = Tmp2.getOperand(1);
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+  Chain = SP.getValue(1);
+  unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const TargetFrameLowering *TFL = ST.getFrameLowering();
+  unsigned Opc =
+    TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
+    ISD::ADD : ISD::SUB;
+
+  SDValue ScaledSize = DAG.getNode(
+      ISD::SHL, dl, VT, Size,
+      DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));
+
+  unsigned StackAlign = TFL->getStackAlignment();
+  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
+  if (Align > StackAlign)
+    Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
+                       DAG.getConstant(-(uint64_t)Align, dl, VT));
+  Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1);    // Output chain
+  Tmp2 = DAG.getCALLSEQ_END(
+      Chain, DAG.getIntPtrConstant(0, dl, true),
+      DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
+
+  return DAG.getMergeValues({Tmp1, Tmp2}, dl);
+}
+
+SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  // We only handle constant sizes here to allow non-entry block, static sized
+  // allocas. A truly dynamic value is more difficult to support because we
+  // don't know if the size value is uniform or not. If the size isn't uniform,
+  // we would need to do a wave reduction to get the maximum size to know how
+  // much to increment the uniform stack pointer.
+  SDValue Size = Op.getOperand(1);
+  if (isa<ConstantSDNode>(Size))
+      return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
+
+  return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
+}
+
 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
                                              const MachineFunction &MF) const {
   Register Reg = StringSwitch<Register>(RegName)
@@ -4305,6 +4366,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FMINNUM_IEEE:
   case ISD::FMAXNUM_IEEE:
     return splitBinaryVectorOp(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC:
+    return LowerDYNAMIC_STACKALLOC(Op, DAG);
   }
   return SDValue();
 }

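Note on the wave-size scaling above: the stack pointer is per-wave, so the per-lane
allocation size is shifted left by log2(wavefront size) before being added to SP. On
gfx900 (a wave64 target), the 64-byte [16 x i32] allocas in the test below are therefore
scaled to 64 << 6 = 0x1000 bytes, which is the s_add_i32 s6, s32, 0x1000 increment seen
in the generated checks. Only constant sizes are handled this way; a non-constant size
still takes the AMDGPUTargetLowering path, since a divergent size would require a wave
reduction to find the maximum increment.
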
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 7ef11eba4f9c..da0260f4ed2d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -337,6 +337,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue LowerCall(CallLoweringInfo &CLI,
                     SmallVectorImpl<SDValue> &InVals) const override;
 
+  SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+
   Register getRegisterByName(const char* RegName, LLT VT,
                              const MachineFunction &MF) const override;
 

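For context on the size checks in the test that follows: the DEFAULTSIZE and ASSUME1024
values differ by exactly 4096 - 1024 = 3072 bytes (4112 vs 1040, 4160 vs 1088), i.e. the
assumed dynamic-stack-object size is added on top of the statically known frame, so
overriding the flag shifts the reported private segment size one-for-one.
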
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
new file mode 100644
index 000000000000..060d66ae8428
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -0,0 +1,271 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024 %s
+
+; FIXME: Generated test checks do not check metadata at the end of the
+; function, so this also includes manually added checks.
+
+; Test that we can select a statically sized alloca outside of the
+; entry block.
+
+; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an
+; alignment less than the stack alignment.
+define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
+; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b32 s33, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_lg_u32 s8, 0
+; GCN-NEXT:    s_cbranch_scc1 BB0_3
+; GCN-NEXT:  ; %bb.1: ; %bb.0
+; GCN-NEXT:    s_cmp_lg_u32 s9, 0
+; GCN-NEXT:    s_cbranch_scc1 BB0_3
+; GCN-NEXT:  ; %bb.2: ; %bb.1
+; GCN-NEXT:    s_add_i32 s6, s32, 0x1000
+; GCN-NEXT:    s_lshl_b32 s7, s10, 2
+; GCN-NEXT:    s_mov_b32 s32, s6
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_add_i32 s6, s6, s7
+; GCN-NEXT:    v_mov_b32_e32 v3, 1
+; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_add_u32_e32 v2, v1, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    global_store_dword v[0:1], v2, off
+; GCN-NEXT:  BB0_3: ; %bb.2
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    global_store_dword v[0:1], v0, off
+; GCN-NEXT:    s_endpgm
+
+entry:
+  %cond0 = icmp eq i32 %arg.cond0, 0
+  br i1 %cond0, label %bb.0, label %bb.2
+
+bb.0:
+  %alloca = alloca [16 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
+  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
+  %cond1 = icmp eq i32 %arg.cond1, 0
+  br i1 %cond1, label %bb.1, label %bb.2
+
+bb.1:
+  ; Use the alloca outside of the defining block.
+  store i32 0, i32 addrspace(5)* %gep0
+  store i32 1, i32 addrspace(5)* %gep1
+  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
+  %load = load i32, i32 addrspace(5)* %gep2
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %add = add i32 %load, %tid
+  store i32 %add, i32 addrspace(1)* %out
+  br label %bb.2
+
+bb.2:
+  store volatile i32 0, i32 addrspace(1)* undef
+  ret void
+}
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
+; DEFAULTSIZE: ; ScratchSize: 4112
+
+; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
+; ASSUME1024: ; ScratchSize: 1040
+
+define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
+; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
+; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b32 s33, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_lg_u32 s6, 0
+; GCN-NEXT:    s_cbranch_scc1 BB1_2
+; GCN-NEXT:  ; %bb.1: ; %bb.0
+; GCN-NEXT:    s_add_i32 s6, s32, 0x1000
+; GCN-NEXT:    s_andn2_b32 s6, s6, 63
+; GCN-NEXT:    s_lshl_b32 s7, s7, 2
+; GCN-NEXT:    s_mov_b32 s32, s6
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_add_i32 s6, s6, s7
+; GCN-NEXT:    v_mov_b32_e32 v3, 1
+; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_add_u32_e32 v2, v1, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    global_store_dword v[0:1], v2, off
+; GCN-NEXT:  BB1_2: ; %bb.1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    global_store_dword v[0:1], v0, off
+; GCN-NEXT:    s_endpgm
+entry:
+  %cond = icmp eq i32 %arg.cond, 0
+  br i1 %cond, label %bb.0, label %bb.1
+
+bb.0:
+  %alloca = alloca [16 x i32], align 64, addrspace(5)
+  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
+  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
+  store i32 0, i32 addrspace(5)* %gep0
+  store i32 1, i32 addrspace(5)* %gep1
+  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
+  %load = load i32, i32 addrspace(5)* %gep2
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %add = add i32 %load, %tid
+  store i32 %add, i32 addrspace(1)* %out
+  br label %bb.1
+
+bb.1:
+  store volatile i32 0, i32 addrspace(1)* undef
+  ret void
+}
+
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
+; DEFAULTSIZE: ; ScratchSize: 4160
+
+; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
+; ASSUME1024: ; ScratchSize: 1088
+
+
+define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
+; GCN-LABEL: func_non_entry_block_static_alloca_align4:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s7, s33
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_add_u32 s32, s32, 0x400
+; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT:    s_cbranch_execz BB2_3
+; GCN-NEXT:  ; %bb.1: ; %bb.0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN-NEXT:    s_and_b64 exec, exec, vcc
+; GCN-NEXT:    s_cbranch_execz BB2_3
+; GCN-NEXT:  ; %bb.2: ; %bb.1
+; GCN-NEXT:    s_add_i32 s6, s32, 0x1000
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, s6
+; GCN-NEXT:    v_mov_b32_e32 v6, 1
+; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v6, v3, s[0:3], 0 offen offset:4
+; GCN-NEXT:    v_lshl_add_u32 v2, v4, 2, s6
+; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_and_b32_e32 v3, 0x3ff, v5
+; GCN-NEXT:    s_mov_b32 s32, s6
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_add_u32_e32 v2, v2, v3
+; GCN-NEXT:    global_store_dword v[0:1], v2, off
+; GCN-NEXT:  BB2_3: ; %bb.2
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    global_store_dword v[0:1], v0, off
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
+; GCN-NEXT:    s_mov_b32 s33, s7
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+
+entry:
+  %cond0 = icmp eq i32 %arg.cond0, 0
+  br i1 %cond0, label %bb.0, label %bb.2
+
+bb.0:
+  %alloca = alloca [16 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
+  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
+  %cond1 = icmp eq i32 %arg.cond1, 0
+  br i1 %cond1, label %bb.1, label %bb.2
+
+bb.1:
+  ; Use the alloca outside of the defining block.
+  store i32 0, i32 addrspace(5)* %gep0
+  store i32 1, i32 addrspace(5)* %gep1
+  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
+  %load = load i32, i32 addrspace(5)* %gep2
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %add = add i32 %load, %tid
+  store i32 %add, i32 addrspace(1)* %out
+  br label %bb.2
+
+bb.2:
+  store volatile i32 0, i32 addrspace(1)* undef
+  ret void
+}
+
+define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
+; GCN-LABEL: func_non_entry_block_static_alloca_align64:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_add_u32 s4, s32, 0xfc0
+; GCN-NEXT:    s_mov_b32 s7, s33
+; GCN-NEXT:    s_and_b32 s33, s4, 0xfffff000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GCN-NEXT:    s_add_u32 s32, s32, 0x2000
+; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT:    s_cbranch_execz BB3_2
+; GCN-NEXT:  ; %bb.1: ; %bb.0
+; GCN-NEXT:    s_add_i32 s6, s32, 0x1000
+; GCN-NEXT:    s_andn2_b32 s6, s6, 63
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v5, s6
+; GCN-NEXT:    v_mov_b32_e32 v6, 1
+; GCN-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen offset:4
+; GCN-NEXT:    v_lshl_add_u32 v2, v3, 2, s6
+; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_and_b32_e32 v3, 0x3ff, v4
+; GCN-NEXT:    s_mov_b32 s32, s6
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_add_u32_e32 v2, v2, v3
+; GCN-NEXT:    global_store_dword v[0:1], v2, off
+; GCN-NEXT:  BB3_2: ; %bb.1
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    global_store_dword v[0:1], v0, off
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x2000
+; GCN-NEXT:    s_mov_b32 s33, s7
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %cond = icmp eq i32 %arg.cond, 0
+  br i1 %cond, label %bb.0, label %bb.1
+
+bb.0:
+  %alloca = alloca [16 x i32], align 64, addrspace(5)
+  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
+  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
+  store i32 0, i32 addrspace(5)* %gep0
+  store i32 1, i32 addrspace(5)* %gep1
+  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
+  %load = load i32, i32 addrspace(5)* %gep2
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %add = add i32 %load, %tid
+  store i32 %add, i32 addrspace(1)* %out
+  br label %bb.1
+
+bb.1:
+  store volatile i32 0, i32 addrspace(1)* undef
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone speculatable }
