[llvm] 0bd1c87 - [AMDGPU] Support divergent sized dynamic alloca (#121148)

via llvm-commits llvm-commits at lists.llvm.org
Sun Jan 5 21:28:28 PST 2025


Author: Aaditya
Date: 2025-01-06T12:28:24+07:00
New Revision: 0bd1c879966cfdf145b1f96292a2632628fab3fb

URL: https://github.com/llvm/llvm-project/commit/0bd1c879966cfdf145b1f96292a2632628fab3fb
DIFF: https://github.com/llvm/llvm-project/commit/0bd1c879966cfdf145b1f96292a2632628fab3fb.diff

LOG: [AMDGPU] Support divergent sized dynamic alloca (#121148)

Currently, the AMDGPU backend can handle uniform-sized dynamic allocas.
This patch extends support to divergent-sized dynamic allocas.
When the size argument of a dynamic alloca is divergent,
a wave-wide reduction is performed to get the required stack space.
`@llvm.amdgcn.wave.reduce.umax` is used to perform the
wave reduction.

Dynamic allocas are not completely supported yet, 
as the stack is not properly restored on function exit.
This patch doesn't attempt to address the aforementioned issue.

Note: The compiler already zero-extends or truncates all other
types (of the alloca size argument) to i32.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir
    llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll

Removed: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index d94c400ad14225..08e23cbf34e42b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1190,9 +1190,13 @@ bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
 
   const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
 
-  // TODO: Need to emit a wave reduction to get the maximum size.
-  if (SizeBank != &AMDGPU::SGPRRegBank)
-    return false;
+  if (SizeBank != &AMDGPU::SGPRRegBank) {
+    auto WaveReduction =
+        B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, {LLT::scalar(32)})
+            .addUse(AllocSize)
+            .addImm(0);
+    AllocSize = WaveReduction.getReg(0);
+  }
 
   LLT PtrTy = MRI.getType(Dst);
   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 58b061f5c1af0d..b3cfa398d9b5f6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4017,29 +4017,26 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
 }
 
 // This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
-// except for stack growth direction(default: downwards, AMDGPU: upwards) and
-// applying the wave size scale to the increment amount.
-SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
-                                                      SelectionDAG &DAG) const {
+// except for:
+// 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
+// 2. Scale size where, scale = wave-reduction(alloca-size) * wave-size
+SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                  SelectionDAG &DAG) const {
   const MachineFunction &MF = DAG.getMachineFunction();
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
   SDLoc dl(Op);
   EVT VT = Op.getValueType();
-  SDValue Tmp1 = Op;
-  SDValue Tmp2 = Op.getValue(1);
-  SDValue Tmp3 = Op.getOperand(2);
-  SDValue Chain = Tmp1.getOperand(0);
-
+  SDValue Chain = Op.getOperand(0);
   Register SPReg = Info->getStackPtrOffsetReg();
 
   // Chain the dynamic stack allocation so that it doesn't modify the stack
   // pointer when other instructions are using the stack.
   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
 
-  SDValue Size = Tmp2.getOperand(1);
+  SDValue Size = Op.getOperand(1);
   SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
-  Align Alignment = cast<ConstantSDNode>(Tmp3)->getAlignValue();
+  Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
 
   const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
   assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
@@ -4057,30 +4054,36 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
                            DAG.getSignedConstant(-ScaledAlignment, dl, VT));
   }
 
-  SDValue ScaledSize = DAG.getNode(
-      ISD::SHL, dl, VT, Size,
-      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
-
-  SDValue NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
+  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
+  SDValue NewSP;
+  if (isa<ConstantSDNode>(Size)) {
+    // For constant sized alloca, scale alloca size by wave-size
+    SDValue ScaledSize = DAG.getNode(
+        ISD::SHL, dl, VT, Size,
+        DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
+    NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
+  } else {
+    // For dynamic sized alloca, perform wave-wide reduction to get max of
+    // alloca size(divergent) and then scale it by wave-size
+    SDValue WaveReduction =
+        DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
+    Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
+                       Size, DAG.getConstant(0, dl, MVT::i32));
+    SDValue ScaledSize = DAG.getNode(
+        ISD::SHL, dl, VT, Size,
+        DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
+    NewSP =
+        DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
+    SDValue ReadFirstLaneID =
+        DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
+    NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
+                        NewSP);
+  }
 
   Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
-  Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
-
-  return DAG.getMergeValues({BaseAddr, Tmp2}, dl);
-}
-
-SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
-                                                  SelectionDAG &DAG) const {
-  // We only handle constant sizes here to allow non-entry block, static sized
-  // allocas. A truly dynamic value is more difficult to support because we
-  // don't know if the size value is uniform or not. If the size isn't uniform,
-  // we would need to do a wave reduction to get the maximum size to know how
-  // much to increment the uniform stack pointer.
-  SDValue Size = Op.getOperand(1);
-  if (isa<ConstantSDNode>(Size))
-    return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
+  SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
 
-  return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
+  return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
 }
 
 SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 631f26542bbe6d..f4641e7a659907 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -421,7 +421,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue LowerCall(CallLoweringInfo &CLI,
                     SmallVectorImpl<SDValue> &InVals) const override;
 
-  SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
deleted file mode 100644
index aefcad491073fc..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
+++ /dev/null
@@ -1,72 +0,0 @@
-; RUN: not llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel-abort=2 -pass-remarks-missed="gisel.*" -verify-machineinstrs=0 -o /dev/null 2>&1 %s | FileCheck -check-prefix=ERR %s
-
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_align4)
-; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align4
-; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align4 void (ptr addrspace(1)): unsupported dynamic alloca
-
-define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align4(ptr addrspace(1) %ptr) {
-  %id = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
-  %n = load i32, ptr addrspace(1) %gep
-  %alloca = alloca i32, i32 %n, align 4, addrspace(5)
-  store volatile i32 123, ptr addrspace(5) %alloca
-  ret void
-}
-
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_default_align)
-; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_default_align
-; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_default_align void (ptr addrspace(1)): unsupported dynamic alloca
-
-define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_default_align(ptr addrspace(1) %ptr) {
-  %id = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
-  %n = load i32, ptr addrspace(1) %gep
-  %alloca = alloca i32, i32 %n, addrspace(5)
-  store volatile i32 123, ptr addrspace(5) %alloca
-  ret void
-}
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 64 (in function: kernel_dynamic_stackalloc_vgpr_align64)
-; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align64
-; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align64 void (ptr addrspace(1)): unsupported dynamic alloca
-
-define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align64(ptr addrspace(1) %ptr) {
-  %id = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
-  %n = load i32, ptr addrspace(1) %gep
-  %alloca = alloca i32, i32 %n, align 64, addrspace(5)
-  store volatile i32 123, ptr addrspace(5) %alloca
-  ret void
-}
-
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_align4)
-; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align4
-; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align4 void (i32): unsupported dynamic alloca
-
-define void @func_dynamic_stackalloc_vgpr_align4(i32 %n) {
-  %alloca = alloca i32, i32 %n, align 4, addrspace(5)
-  store volatile i32 456, ptr addrspace(5) %alloca
-  ret void
-}
-
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_default_align)
-; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_default_align
-; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_default_align void (i32): unsupported dynamic alloca
-
-define void @func_dynamic_stackalloc_vgpr_default_align(i32 %n) {
-  %alloca = alloca i32, i32 %n, addrspace(5)
-  store volatile i32 456, ptr addrspace(5) %alloca
-  ret void
-}
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 64 (in function: func_dynamic_stackalloc_vgpr_align64)
-; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align64
-; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align64 void (i32): unsupported dynamic alloca
-
-define void @func_dynamic_stackalloc_vgpr_align64(i32 %n) {
-  %alloca = alloca i32, i32 %n, align 64, addrspace(5)
-  store volatile i32 456, ptr addrspace(5) %alloca
-  ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x() #0
-
-attributes #0 = { nounwind readnone speculatable }

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir
index 5378ce2d1efaad..10517a49e697c5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir
@@ -491,3 +491,132 @@ body: |
     %1:_(p5) = G_DYN_STACKALLOC %0, 32
     S_ENDPGM 0, implicit %1
 ...
+
+---
+name: test_dyn_stackalloc_vgpr_align4
+legalized:       true
+frameInfo:
+  maxAlignment: 4
+stack:
+  - { id: 0, type: variable-sized, alignment: 4 }
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align4
+    ; WAVE64: liveins: $vgpr0
+    ; WAVE64-NEXT: {{  $}}
+    ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+    ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+    ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+    ;
+    ; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align4
+    ; WAVE32: liveins: $vgpr0
+    ; WAVE32-NEXT: {{  $}}
+    ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+    ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+    ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(p5) = G_DYN_STACKALLOC %0, 4
+    S_ENDPGM 0, implicit %1
+...
+
+---
+name: test_dyn_stackalloc_vgpr_align16
+legalized:       true
+frameInfo:
+  maxAlignment: 16
+stack:
+  - { id: 0, type: variable-sized, alignment: 16 }
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align16
+    ; WAVE64: liveins: $vgpr0
+    ; WAVE64-NEXT: {{  $}}
+    ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+    ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+    ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+    ;
+    ; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align16
+    ; WAVE32: liveins: $vgpr0
+    ; WAVE32-NEXT: {{  $}}
+    ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+    ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+    ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(p5) = G_DYN_STACKALLOC %0, 16
+    S_ENDPGM 0, implicit %1
+...
+
+---
+name: test_dyn_stackalloc_vgpr_align64
+legalized:       true
+frameInfo:
+  maxAlignment: 64
+stack:
+  - { id: 0, type: variable-sized, alignment: 64 }
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align64
+    ; WAVE64: liveins: $vgpr0
+    ; WAVE64-NEXT: {{  $}}
+    ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
+    ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
+    ; WAVE64-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -4096
+    ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
+    ; WAVE64-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
+    ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
+    ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
+    ;
+    ; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align64
+    ; WAVE32: liveins: $vgpr0
+    ; WAVE32-NEXT: {{  $}}
+    ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+    ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+    ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+    ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2047
+    ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
+    ; WAVE32-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048
+    ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
+    ; WAVE32-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
+    ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
+    ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(p5) = G_DYN_STACKALLOC %0, 64
+    S_ENDPGM 0, implicit %1
+...

diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 73aa87e5c55d20..9acb3a42ae102c 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -1,64 +1,829 @@
-; RUN: not llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=+promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s
-; RUN: not llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s
-; RUN: not llc -mtriple=r600-- -mcpu=cypress < %s 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-GISEL %s
 target datalayout = "A5"
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s5, s5, 15
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, -16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s5, 6
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-SDAG-NEXT:    s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-GISEL-NEXT:    s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, s32
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s0 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s1, s1, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, -16
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
+; GFX11-GISEL-NEXT:    s_endpgm
   %alloca = alloca i32, i32 %n, addrspace(5)
   store volatile i32 123, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    s_add_i32 s5, s32, 0x1fff
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s5, s5, 0xffffe000
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 10
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0x1fff
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s4, s4, 15
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xffffe000
+; GFX9-GISEL-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 10
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-SDAG-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 10
+; GFX11-SDAG-NEXT:    s_add_i32 s1, s32, 0xfff
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-GISEL-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 10
+; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0xfff
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s0, s0, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    s_endpgm
   %alloca = alloca i32, i32 %n, align 128, addrspace(5)
   store volatile i32 10, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 22
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s5, s5, 15
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, -16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 22
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s5, 6
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 22
+; GFX11-SDAG-NEXT:    s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 22
+; GFX11-GISEL-NEXT:    s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, s32
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s0 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s1, s1, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, -16
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s0, s1
+; GFX11-GISEL-NEXT:    s_endpgm
   %alloca = alloca i32, i32 %n, align 2, addrspace(5)
   store volatile i32 22, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca float, i32 %idx, addrspace(5)
   store volatile i32 123, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0x1fff
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, 0xffffe000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0x1fff
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xffffe000
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s6, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1bc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s32, 0xfff
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, 0xfffff000
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s0 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB4_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1bc
+; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0xfff
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i32, i32 %idx, align 128, addrspace(5)
   store volatile i32 444, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligned() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 4, 15
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX11-SDAG-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 4, 15
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i128, i32 %idx, align 2, addrspace(5)
   store volatile i32 666, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %n, i32 %m) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb.0
+; GFX9-SDAG-NEXT:    s_lshl_b32 s5, s5, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s5, s5, 15
+; GFX9-SDAG-NEXT:    s_add_i32 s6, s32, 0xfff
+; GFX9-SDAG-NEXT:    s_and_b32 s5, s5, -16
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s9, s6, 0xfffff000
+; GFX9-SDAG-NEXT:    s_lshl_b32 s5, s5, 6
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s9, s5
+; GFX9-SDAG-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s5, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s10, v0, s5
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s5
+; GFX9-SDAG-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX9-SDAG-NEXT:  ; %bb.3:
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s8, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 3
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:  .LBB6_4: ; %bb.1
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x2000
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb.0
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s5, s5, 15
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, -16
+; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s5, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0xfff
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xfffff000
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s6
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s10, v0, s9
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX9-GISEL-NEXT:  ; %bb.3:
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    s_lshl_b32 s7, s8, 6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s7
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:  .LBB6_4: ; %bb.1
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s4, s4, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s5, s32
+; GFX9-GISEL-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX11-SDAG-NEXT:    s_add_i32 s3, s32, 0x7ff
+; GFX11-SDAG-NEXT:    s_add_i32 s1, s1, 15
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_and_b32 s4, s1, -16
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_and_b32 s1, s3, 0xfffff800
+; GFX11-SDAG-NEXT:    s_lshl_b32 s3, s4, 5
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-SDAG-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s3, s4
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s2, s2, s5
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX11-SDAG-NEXT:  ; %bb.3:
+; GFX11-SDAG-NEXT:    s_mov_b32 s3, s32
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s2, 5, s3
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s3 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:  .LBB6_4: ; %bb.1
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s33 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_movk_i32 s32, 0x80
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX11-GISEL-NEXT:  ; %bb.1: ; %bb.0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s1, s1, 15
+; GFX11-GISEL-NEXT:    s_add_u32 s3, s32, 0x7ff
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, -16
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_lshl_b32 s4, s1, 5
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s3, 0xfffff800
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s4
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s2, s2, s5
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB6_2
+; GFX11-GISEL-NEXT:  ; %bb.3:
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s2, s2, 5
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s3 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s3, s2
+; GFX11-GISEL-NEXT:  .LBB6_4: ; %bb.1
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s0, s0, 15
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-GISEL-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s33 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    s_endpgm
 entry:
   %cond = icmp eq i32 %n, 0
   %alloca1 = alloca i32, i32 8, addrspace(5)
@@ -77,10 +842,206 @@ bb.1:
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i32 %m) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_control_flow:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT:    s_movk_i32 s32, 0x1000
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc0 .LBB7_6
+; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb.1
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s8, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s9, v0, s8
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s8
+; GFX9-SDAG-NEXT:    s_max_u32 s4, s4, s9
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX9-SDAG-NEXT:  ; %bb.3:
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s4, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s6
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_cbranch_execnz .LBB7_5
+; GFX9-SDAG-NEXT:  .LBB7_4: ; %bb.0
+; GFX9-SDAG-NEXT:    s_lshl_b32 s5, s5, 2
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0xfff
+; GFX9-SDAG-NEXT:    s_add_i32 s5, s5, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, 0xfffff000
+; GFX9-SDAG-NEXT:    s_and_b32 s5, s5, -16
+; GFX9-SDAG-NEXT:    s_lshl_b32 s5, s5, 6
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-SDAG-NEXT:    s_add_i32 s32, s4, s5
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:  .LBB7_5: ; %bb.2
+; GFX9-SDAG-NEXT:    s_endpgm
+; GFX9-SDAG-NEXT:  .LBB7_6:
+; GFX9-SDAG-NEXT:    s_branch .LBB7_4
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_control_flow:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, 1
+; GFX9-GISEL-NEXT:    s_movk_i32 s32, 0x1000
+; GFX9-GISEL-NEXT:    s_cbranch_scc0 .LBB7_4
+; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb.1
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s4, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s9, v0, s4
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s4
+; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s9
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX9-GISEL-NEXT:  ; %bb.3:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s8, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX9-GISEL-NEXT:  .LBB7_4: ; %Flow
+; GFX9-GISEL-NEXT:    s_xor_b32 s4, s4, 1
+; GFX9-GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB7_6
+; GFX9-GISEL-NEXT:  ; %bb.5: ; %bb.0
+; GFX9-GISEL-NEXT:    s_lshl2_add_u32 s4, s5, 15
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0xfff
+; GFX9-GISEL-NEXT:    s_and_b32 s4, s4, -16
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xfffff000
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s4, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:  .LBB7_6: ; %bb.2
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_control_flow:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s32, 64
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc0 .LBB7_6
+; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.1
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s4
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX11-SDAG-NEXT:  ; %bb.3:
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s2
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s2 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_cbranch_execnz .LBB7_5
+; GFX11-SDAG-NEXT:  .LBB7_4: ; %bb.0
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s1, 2
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT:    s_add_i32 s1, s32, 0x7ff
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT:    s_and_b32 s1, s1, 0xfffff800
+; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT:  .LBB7_5: ; %bb.2
+; GFX11-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-NEXT:  .LBB7_6:
+; GFX11-SDAG-NEXT:    s_branch .LBB7_4
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_control_flow:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s32, 64
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 1
+; GFX11-GISEL-NEXT:    s_cbranch_scc0 .LBB7_4
+; GFX11-GISEL-NEXT:  ; %bb.1: ; %bb.1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s3, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s0, s3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s2, s2, s4
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GFX11-GISEL-NEXT:  ; %bb.3:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s2, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s3, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s3 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:  .LBB7_4: ; %Flow
+; GFX11-GISEL-NEXT:    s_xor_b32 s0, s0, 1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_and_b32 s0, s0, 1
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB7_6
+; GFX11-GISEL-NEXT:  ; %bb.5: ; %bb.0
+; GFX11-GISEL-NEXT:    s_lshl2_add_u32 s0, s1, 15
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0x7ff
+; GFX11-GISEL-NEXT:    s_and_b32 s0, s0, -16
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff800
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:  .LBB7_6: ; %bb.2
+; GFX11-GISEL-NEXT:    s_endpgm
 entry:
   %cond = icmp eq i32 %n, 0
   br i1 %cond, label %bb.0, label %bb.1
@@ -97,62 +1058,1113 @@ bb.2:
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i32 %n, addrspace(5)
   store volatile i32 123, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0x1fc0
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x4000
+; GFX9-SDAG-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0x1fff
+; GFX9-SDAG-NEXT:    s_and_b32 s4, s4, 0xffffe000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 10
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xc000
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0x1fc0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x4000
+; GFX9-GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0x1fff
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s6, 6
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xffffe000
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 10
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xc000
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 0x7f
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0x100
+; GFX11-SDAG-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_add_i32 s1, s32, 0xfff
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 10
+; GFX11-SDAG-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xff00
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 0x7f
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0x100
+; GFX11-GISEL-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0xfff
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 10
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xff00
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i32 %n, align 128, addrspace(5)
   store volatile i32 10, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 22
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 22
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 22
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 22
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, i32 %n, align 2, addrspace(5)
   store volatile i32 22, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_divergent() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB11_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB11_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB11_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB11_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i32, i32 %idx, addrspace(5)
   store volatile i32 123, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, s33
+; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0x1fc0
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x4000
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-SDAG-NEXT:    s_add_i32 s4, s32, 0x1fff
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_and_b32 s6, s4, 0xffffe000
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s7, 0
+; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-SDAG-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s8, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s9, v0, s8
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s8
+; GFX9-SDAG-NEXT:    s_max_u32 s7, s7, s9
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB12_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s7, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xc000
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s10
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0x1fc0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x4000
+; GFX9-GISEL-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB12_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_add_u32 s5, s32, 0x1fff
+; GFX9-GISEL-NEXT:    s_lshl_b32 s4, s6, 6
+; GFX9-GISEL-NEXT:    s_and_b32 s5, s5, 0xffffe000
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1bc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xc000
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-SDAG-NEXT:    s_mov_b32 s5, s33
+; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 0x7f
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0x100
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_add_i32 s0, s32, 0xfff
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT:    s_and_b32 s0, s0, 0xfffff000
+; GFX11-SDAG-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB12_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1bc
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s0 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xff00
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 0x7f
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_and_b32 s33, s33, 0xffffff80
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0x100
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB12_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    s_add_u32 s1, s32, 0xfff
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1bc
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_and_b32 s1, s1, 0xfffff000
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xff00
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i32, i32 %idx, align 128, addrspace(5)
   store volatile i32 444, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB13_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB13_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB13_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB13_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %alloca = alloca i32, i32 %idx, align 2, addrspace(5)
   store volatile i32 666, ptr addrspace(5) %alloca
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s13, s33
+; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x3000
+; GFX9-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB14_6
+; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb.0
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, 0
+; GFX9-SDAG-NEXT:  .LBB14_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s11, v1, s9
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-SDAG-NEXT:    s_max_u32 s10, s10, s11
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB14_2
+; GFX9-SDAG-NEXT:  ; %bb.3:
+; GFX9-SDAG-NEXT:    s_add_i32 s6, s32, 0xfff
+; GFX9-SDAG-NEXT:    s_and_b32 s9, s6, 0xfffff000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s10, 6, v1
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, 0x1ff0, v1
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s10, 0
+; GFX9-SDAG-NEXT:  .LBB14_4: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s11, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s12, v1, s11
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s11
+; GFX9-SDAG-NEXT:    s_max_u32 s10, s10, s12
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB14_4
+; GFX9-SDAG-NEXT:  ; %bb.5:
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, s10, 6, v1
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 3
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 4
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, off, s[0:3], s6
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:  .LBB14_6: ; %bb.1
+; GFX9-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s6, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s7, v0, s6
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s6
+; GFX9-SDAG-NEXT:    s_max_u32 s8, s8, s7
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB14_7
+; GFX9-SDAG-NEXT:  ; %bb.8:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s8, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    buffer_store_dword v1, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xd000
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s13
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s13, s33
+; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x3000
+; GFX9-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB14_6
+; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb.0
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, 0
+; GFX9-GISEL-NEXT:  .LBB14_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s11, v1, s10
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
+; GFX9-GISEL-NEXT:    s_max_u32 s9, s9, s11
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB14_2
+; GFX9-GISEL-NEXT:  ; %bb.3:
+; GFX9-GISEL-NEXT:    s_add_u32 s7, s32, 0xfff
+; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s9, 6
+; GFX9-GISEL-NEXT:    s_and_b32 s9, s7, 0xfffff000
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v1, v2, 2, 15
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s9, s6
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s10, 0
+; GFX9-GISEL-NEXT:  .LBB14_4: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s11, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s12, v1, s11
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s11
+; GFX9-GISEL-NEXT:    s_max_u32 s10, s10, s12
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB14_4
+; GFX9-GISEL-NEXT:  ; %bb.5:
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 3
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-GISEL-NEXT:    s_lshl_b32 s7, s10, 6
+; GFX9-GISEL-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s7
+; GFX9-GISEL-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:  .LBB14_6: ; %bb.1
+; GFX9-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s6, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s7, v0, s6
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s6
+; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s7
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB14_7
+; GFX9-GISEL-NEXT:  ; %bb.8:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s8, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xd000
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s13
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_mov_b32 s7, s33
+; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 63
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xc0
+; GFX11-SDAG-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB14_6
+; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.0
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s3, 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX11-SDAG-NEXT:  .LBB14_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s4, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s5, v1, s4
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s4
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s3, s3, s5
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB14_2
+; GFX11-SDAG-NEXT:  ; %bb.3:
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
+; GFX11-SDAG-NEXT:    s_add_i32 s2, s32, 0x7ff
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, exec_lo
+; GFX11-SDAG-NEXT:    s_and_b32 s2, s2, 0xfffff800
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v2, s3, 5, s2
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s3, 0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0x1ff0, v1
+; GFX11-SDAG-NEXT:  .LBB14_4: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s5, s4
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s6, v1, s5
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s4, s5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s3, s3, s6
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB14_4
+; GFX11-SDAG-NEXT:  ; %bb.5:
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, s3, 5, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s2 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v3, s4 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT:  .LBB14_6: ; %bb.1
+; GFX11-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v0, 2, 15
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX11-SDAG-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v1, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB14_7
+; GFX11-SDAG-NEXT:  ; %bb.8:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, s0, 5, s1
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v2, s33 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s7
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xff40
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_mov_b32 s7, s33
+; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 63
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xc0
+; GFX11-GISEL-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-GISEL-NEXT:    s_cbranch_execz .LBB14_6
+; GFX11-GISEL-NEXT:  ; %bb.1: ; %bb.0
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v2, v1, 2, 15
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ff, v31
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v2, -16, v2
+; GFX11-GISEL-NEXT:  .LBB14_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s5, v2, s4
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s3, s4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s2, s2, s5
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB14_2
+; GFX11-GISEL-NEXT:  ; %bb.3:
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-GISEL-NEXT:    s_lshl_b32 s5, s2, 5
+; GFX11-GISEL-NEXT:    s_add_u32 s2, s32, 0x7ff
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, exec_lo
+; GFX11-GISEL-NEXT:    s_and_b32 s2, s2, 0xfffff800
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, 0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s2, s5
+; GFX11-GISEL-NEXT:  .LBB14_4: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s5, s4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s6, v1, s5
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s4, s5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s3, s3, s6
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB14_4
+; GFX11-GISEL-NEXT:  ; %bb.5:
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s3, s3, 5
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s2 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v2, s4 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s4, s3
+; GFX11-GISEL-NEXT:  .LBB14_6: ; %bb.1
+; GFX11-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB14_7: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB14_7
+; GFX11-GISEL-NEXT:  ; %bb.8:
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s33 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s7
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xff40
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %cond = icmp eq i32 %n, 0
   %alloca1 = alloca i32, i32 8, addrspace(5)
@@ -171,10 +2183,272 @@ bb.1:
   ret void
 }
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
 define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s11, s33
+; GFX9-SDAG-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB15_4
+; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb.1
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s10, v1, s9
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-SDAG-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB15_2
+; GFX9-SDAG-NEXT:  ; %bb.3:
+; GFX9-SDAG-NEXT:    s_add_i32 s6, s32, 0xfff
+; GFX9-SDAG-NEXT:    s_and_b32 s6, s6, 0xfffff000
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v2, s8, 6, v1
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v2
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    ; implicit-def: $vgpr31
+; GFX9-SDAG-NEXT:  .LBB15_4: ; %Flow
+; GFX9-SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB15_8
+; GFX9-SDAG-NEXT:  ; %bb.5: ; %bb.0
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT:  .LBB15_6: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s10, v0, s9
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-SDAG-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB15_6
+; GFX9-SDAG-NEXT:  ; %bb.7:
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s8, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s6
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:  .LBB15_8: ; %bb.2
+; GFX9-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xe000
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s11
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_mov_b32 s11, s33
+; GFX9-GISEL-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x2000
+; GFX9-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB15_4
+; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb.1
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v1, 2, 15
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s10, v0, s9
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB15_2
+; GFX9-GISEL-NEXT:  ; %bb.3:
+; GFX9-GISEL-NEXT:    s_add_u32 s7, s32, 0xfff
+; GFX9-GISEL-NEXT:    s_and_b32 s7, s7, 0xfffff000
+; GFX9-GISEL-NEXT:    s_lshl_b32 s6, s8, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s7, s6
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    ; implicit-def: $vgpr31
+; GFX9-GISEL-NEXT:  .LBB15_4: ; %Flow
+; GFX9-GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB15_8
+; GFX9-GISEL-NEXT:  ; %bb.5: ; %bb.0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT:  .LBB15_6: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s9, s[6:7]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s10, v0, s9
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[6:7], s9
+; GFX9-GISEL-NEXT:    s_max_u32 s8, s8, s10
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB15_6
+; GFX9-GISEL-NEXT:  ; %bb.7:
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s7, s8, 6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s6, s7
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:  .LBB15_8: ; %bb.2
+; GFX9-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xe000
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s11
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_mov_b32 s5, s33
+; GFX11-SDAG-NEXT:    s_add_i32 s33, s32, 63
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-SDAG-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0x80
+; GFX11-SDAG-NEXT:    v_cmpx_ne_u32_e32 0, v0
+; GFX11-SDAG-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB15_4
+; GFX11-SDAG-NEXT:  ; %bb.1: ; %bb.1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX11-SDAG-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v1, s3
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB15_2
+; GFX11-SDAG-NEXT:  ; %bb.3:
+; GFX11-SDAG-NEXT:    s_add_i32 s2, s32, 0x7ff
+; GFX11-SDAG-NEXT:    ; implicit-def: $vgpr31
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    s_and_b32 s2, s2, 0xfffff800
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v1, s1, 5, s2
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v0, s2 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT:  .LBB15_4: ; %Flow
+; GFX11-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-SDAG-NEXT:    s_cbranch_execz .LBB15_8
+; GFX11-SDAG-NEXT:  ; %bb.5: ; %bb.0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT:  .LBB15_6: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB15_6
+; GFX11-SDAG-NEXT:  ; %bb.7:
+; GFX11-SDAG-NEXT:    s_mov_b32 s2, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 1
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s1, 5, s2
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s2 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:  .LBB15_8: ; %bb.2
+; GFX11-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_addk_i32 s32, 0xff80
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s5
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_mov_b32 s5, s33
+; GFX11-GISEL-NEXT:    s_add_i32 s33, s32, 63
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-GISEL-NEXT:    s_and_not1_b32 s33, s33, 63
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0x80
+; GFX11-GISEL-NEXT:    v_cmpx_ne_u32_e32 0, v0
+; GFX11-GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-GISEL-NEXT:    s_cbranch_execz .LBB15_4
+; GFX11-GISEL-NEXT:  ; %bb.1: ; %bb.1
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v1, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB15_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB15_2
+; GFX11-GISEL-NEXT:  ; %bb.3:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-GISEL-NEXT:    s_add_u32 s2, s32, 0x7ff
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT:    s_and_b32 s2, s2, 0xfffff800
+; GFX11-GISEL-NEXT:    ; implicit-def: $vgpr31
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s2, s1
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s2 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:  .LBB15_4: ; %Flow
+; GFX11-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-GISEL-NEXT:    s_cbranch_execz .LBB15_8
+; GFX11-GISEL-NEXT:  ; %bb.5: ; %bb.0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, 0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB15_6: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s2, s3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s1, s1, s4
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB15_6
+; GFX11-GISEL-NEXT:  ; %bb.7:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 1
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s2, s1
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s2 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:  .LBB15_8: ; %bb.2
+; GFX11-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT:    s_addk_i32 s32, 0xff80
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s5
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %cond = icmp eq i32 %n, 0
   br i1 %cond, label %bb.0, label %bb.1
@@ -190,3 +2464,257 @@ bb.1:
 bb.2:
   ret void
 }
+
+define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 %n) { ; dynamic alloca whose element count is an i16 argument; the checks reduce the per-lane size to a wave-wide unsigned max before advancing s32
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x7fff0, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x7fff0, v0
+; GFX11-SDAG-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:  .LBB16_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %alloca = alloca i32, i16 %n, align 2, addrspace(5) ; checks mask %n with 0xffff (zero-extend to 32 bits), then round n*4 up to a multiple of 16 per lane before the max loop
+  store volatile i32 666, ptr addrspace(5) %alloca ; 666 shows up as 0x29a in the checks; NOTE(review): volatile presumably keeps the store (and so the alloca) from being optimized away - confirm
+  ret void
+}
+
+define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 %n) { ; dynamic alloca whose element count is an i64 argument; the checks reduce the per-lane size to a wave-wide unsigned max before advancing s32
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT:    s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX9-SDAG-NEXT:  ; %bb.2:
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT:    v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT:    s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT:    s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT:    s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX9-GISEL-NEXT:  ; %bb.2:
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT:    s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT:    s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT:    s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT:    s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX11-SDAG-NEXT:  ; %bb.2:
+; GFX11-SDAG-NEXT:    s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x29a
+; GFX11-SDAG-NEXT:    v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT:    s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT:    scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT:    s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX11-GISEL-NEXT:  ; %bb.2:
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x29a
+; GFX11-GISEL-NEXT:    s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT:    s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT:    s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT:    scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %alloca = alloca i32, i64 %n, align 2, addrspace(5) ; checks read only v0 for the size - presumably the low 32 bits of %n, i.e. a truncation to i32 - then round n*4 up to a multiple of 16 per lane before the max loop
+  store volatile i32 666, ptr addrspace(5) %alloca ; 666 shows up as 0x29a in the checks; NOTE(review): volatile presumably keeps the store (and so the alloca) from being optimized away - confirm
+  ret void
+}


        


More information about the llvm-commits mailing list