[llvm] [AMDGPU] Supporting dynamically sized allocas (PR #118764)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 5 00:47:49 PST 2024
https://github.com/easyonaadit created https://github.com/llvm/llvm-project/pull/118764
When the stack space to be allocated is not known at
compile time, a wave-wide reduction is applied to get
the maximum stack space needed. The uniform stack pointer
is then updated accordingly.
From 97e4869c29ba488c9b4f462e1eaeb9e72b720a4e Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Sun, 1 Dec 2024 14:43:05 +0530
Subject: [PATCH] [AMDGPU] supporting dynamically sized allocas
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 50 +-
llvm/test/CodeGen/AMDGPU/dynamic-alloca.ll | 2141 ++++++++++++++++++++
2 files changed, 2169 insertions(+), 22 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/dynamic-alloca.ll
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a212a9218ca0db..ea51fa44f32fea 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3997,10 +3997,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
InVals, /*IsThisReturn=*/false, SDValue());
}
-// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
-// except for applying the wave size scale to the increment amount.
-SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ // This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
+ // except for applying the wave size scale to the increment amount and doing a
+ // wave reduction for divergent allocation size.
const MachineFunction &MF = DAG.getMachineFunction();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
@@ -4018,6 +4019,8 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
SDValue Size = Tmp2.getOperand(1);
+
+ // Start address of the dynamically sized stack object
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
@@ -4027,12 +4030,28 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
? ISD::ADD
: ISD::SUB;
- SDValue ScaledSize = DAG.getNode(
- ISD::SHL, dl, VT, Size,
- DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
+ if (isa<ConstantSDNode>(Op.getOperand(1))){
+ SDValue ScaledSize = DAG.getNode(
+ ISD::SHL, dl, VT, Size,
+ DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
+ Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
+ }
+ else{
+ SDValue WaveReduction =
+ DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
+ Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
+ WaveReduction, Size, DAG.getConstant(0, dl, MVT::i32));
+ SDValue ScaledSize = DAG.getNode(
+ ISD::SHL, dl, VT, Size,
+ DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
+ Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value in vgpr.
+ SDValue ReadFirstLaneID =
+ DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
+ Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
+ ReadFirstLaneID, Tmp1);
+ }
Align StackAlign = TFL->getStackAlign();
- Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
if (Alignment && *Alignment > StackAlign) {
Tmp1 = DAG.getNode(
ISD::AND, dl, VT, Tmp1,
@@ -4042,25 +4061,12 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
}
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
+ Tmp1 = SP;
Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
return DAG.getMergeValues({Tmp1, Tmp2}, dl);
}
-SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
- SelectionDAG &DAG) const {
- // We only handle constant sizes here to allow non-entry block, static sized
- // allocas. A truly dynamic value is more difficult to support because we
- // don't know if the size value is uniform or not. If the size isn't uniform,
- // we would need to do a wave reduction to get the maximum size to know how
- // much to increment the uniform stack pointer.
- SDValue Size = Op.getOperand(1);
- if (isa<ConstantSDNode>(Size))
- return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
-
- return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
-}
-
SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType() != MVT::i32)
return Op; // Defer to cannot select error.
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-alloca.ll b/llvm/test/CodeGen/AMDGPU/dynamic-alloca.ll
new file mode 100644
index 00000000000000..4a4915ebf4e9de
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-alloca.ll
@@ -0,0 +1,2141 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -march=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10,GFX1064 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10,GFX1032 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11,GFX1164 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11,GFX1132 %s
+
+define amdgpu_kernel void @constant_value() {
+; GFX8DAGISEL-LABEL: constant_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX8DAGISEL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX8DAGISEL-NEXT: s_mov_b32 s14, -1
+; GFX8DAGISEL-NEXT: s_mov_b32 s15, 0xe00000
+; GFX8DAGISEL-NEXT: s_add_u32 s12, s12, s11
+; GFX8DAGISEL-NEXT: s_addc_u32 s13, s13, 0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[12:15], 0
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_endpgm
+; GFX8-LABEL: constant_value:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s90, -1
+; GFX8-NEXT: s_mov_b32 s91, 0xe80000
+; GFX8-NEXT: s_add_u32 s88, s88, s11
+; GFX8-NEXT: s_addc_u32 s89, s89, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX8-NEXT: buffer_store_dword v0, off, s[88:91], 0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: constant_value:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s14, -1
+; GFX9-NEXT: s_mov_b32 s15, 0xe00000
+; GFX9-NEXT: s_add_u32 s12, s12, s11
+; GFX9-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT: buffer_store_dword v0, off, s[12:15], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: constant_value:
+; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s14, -1
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s12, s12, s11
+; GFX1064-NEXT: s_addc_u32 s13, s13, 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[12:15], 0
+; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: constant_value:
+; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-NEXT: s_addc_u32 s13, s13, 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[12:15], 0
+; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-NEXT: s_endpgm
+;
+; GFX11-LABEL: constant_value:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-NEXT: scratch_store_b32 off, v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_endpgm
+entry:
+ %n = add i32 5, 0
+ %dyn_alloca = alloca i32, i32 %n, addrspace(5)
+ store volatile i32 123, ptr addrspace(5) %dyn_alloca
+ ret void
+}
+
+define amdgpu_kernel void @uniform_value(i32 %n) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX8DAGISEL-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX8DAGISEL-NEXT: s_mov_b32 s14, -1
+; GFX8DAGISEL-NEXT: s_mov_b32 s15, 0xe00000
+; GFX8DAGISEL-NEXT: s_add_u32 s12, s12, s11
+; GFX8DAGISEL-NEXT: s_addc_u32 s13, s13, 0
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX8DAGISEL-NEXT: s_add_i32 s0, s0, 15
+; GFX8DAGISEL-NEXT: s_movk_i32 s32, 0x400
+; GFX8DAGISEL-NEXT: s_and_b32 s0, s0, -16
+; GFX8DAGISEL-NEXT: s_mov_b32 s1, s32
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX8DAGISEL-NEXT: s_lshl_b32 s0, s0, 6
+; GFX8DAGISEL-NEXT: s_mov_b32 s33, 0
+; GFX8DAGISEL-NEXT: s_add_i32 s32, s1, s0
+; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[12:15], s1
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_endpgm
+; GFX8-LABEL: uniform_value:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s90, -1
+; GFX8-NEXT: s_mov_b32 s91, 0xe80000
+; GFX8-NEXT: s_add_u32 s88, s88, s11
+; GFX8-NEXT: s_addc_u32 s89, s89, 0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_lshl_b32 s0, s0, 2
+; GFX8-NEXT: s_add_i32 s0, s0, 15
+; GFX8-NEXT: s_movk_i32 s32, 0x400
+; GFX8-NEXT: s_and_b32 s0, s0, -16
+; GFX8-NEXT: s_mov_b32 s1, s32
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX8-NEXT: s_lshl_b32 s0, s0, 6
+; GFX8-NEXT: s_mov_b32 s33, 0
+; GFX8-NEXT: s_add_i32 s32, s1, s0
+; GFX8-NEXT: buffer_store_dword v0, off, s[88:91], s1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: uniform_value:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s14, -1
+; GFX9-NEXT: s_mov_b32 s15, 0xe00000
+; GFX9-NEXT: s_add_u32 s12, s12, s11
+; GFX9-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_lshl_b32 s0, s0, 2
+; GFX9-NEXT: s_add_i32 s0, s0, 15
+; GFX9-NEXT: s_movk_i32 s32, 0x400
+; GFX9-NEXT: s_and_b32 s0, s0, -16
+; GFX9-NEXT: s_mov_b32 s1, s32
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT: s_lshl_b32 s0, s0, 6
+; GFX9-NEXT: s_mov_b32 s33, 0
+; GFX9-NEXT: s_add_i32 s32, s1, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[12:15], s1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: uniform_value:
+; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s14, -1
+; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s12, s12, s11
+; GFX1064-NEXT: s_addc_u32 s13, s13, 0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1064-NEXT: s_movk_i32 s32, 0x400
+; GFX1064-NEXT: s_mov_b32 s33, 0
+; GFX1064-NEXT: s_mov_b32 s1, s32
+; GFX1064-NEXT: buffer_store_dword v0, off, s[12:15], s1
+; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_lshl_b32 s0, s0, 2
+; GFX1064-NEXT: s_add_i32 s0, s0, 15
+; GFX1064-NEXT: s_and_b32 s0, s0, -16
+; GFX1064-NEXT: s_lshl_b32 s0, s0, 6
+; GFX1064-NEXT: s_add_i32 s32, s1, s0
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: uniform_value:
+; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-NEXT: s_addc_u32 s13, s13, 0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1032-NEXT: s_movk_i32 s32, 0x200
+; GFX1032-NEXT: s_mov_b32 s33, 0
+; GFX1032-NEXT: s_mov_b32 s1, s32
+; GFX1032-NEXT: buffer_store_dword v0, off, s[12:15], s1
+; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_lshl_b32 s0, s0, 2
+; GFX1032-NEXT: s_add_i32 s0, s0, 15
+; GFX1032-NEXT: s_and_b32 s0, s0, -16
+; GFX1032-NEXT: s_lshl_b32 s0, s0, 5
+; GFX1032-NEXT: s_add_i32 s32, s1, s0
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: uniform_value:
+; GFX1164: ; %bb.0: ; %entry
+; GFX1164-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1164-NEXT: s_mov_b32 s32, 16
+; GFX1164-NEXT: s_mov_b32 s33, 0
+; GFX1164-NEXT: s_mov_b32 s1, s32
+; GFX1164-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_lshl_b32 s0, s0, 2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_add_i32 s0, s0, 15
+; GFX1164-NEXT: s_and_b32 s0, s0, -16
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_lshl_b32 s0, s0, 6
+; GFX1164-NEXT: s_add_i32 s32, s1, s0
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: uniform_value:
+; GFX1132: ; %bb.0: ; %entry
+; GFX1132-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1132-NEXT: s_mov_b32 s32, 16
+; GFX1132-NEXT: s_mov_b32 s33, 0
+; GFX1132-NEXT: s_mov_b32 s1, s32
+; GFX1132-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_lshl_b32 s0, s0, 2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_add_i32 s0, s0, 15
+; GFX1132-NEXT: s_and_b32 s0, s0, -16
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_lshl_b32 s0, s0, 5
+; GFX1132-NEXT: s_add_i32 s32, s1, s0
+; GFX1132-NEXT: s_endpgm
+entry:
+ %dyn_alloca = alloca i32, i32 %n, addrspace(5)
+ store volatile i32 123, ptr addrspace(5) %dyn_alloca
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value() {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX8DAGISEL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX8DAGISEL-NEXT: s_mov_b32 s14, -1
+; GFX8DAGISEL-NEXT: s_mov_b32 s15, 0xe00000
+; GFX8DAGISEL-NEXT: s_add_u32 s12, s12, s11
+; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX8DAGISEL-NEXT: s_addc_u32 s13, s13, 0
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX8DAGISEL-NEXT: s_mov_b32 s33, 0
+; GFX8DAGISEL-NEXT: s_movk_i32 s32, 0x400
+; GFX8DAGISEL-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s3, s[0:1]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v0, s3
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX8DAGISEL-NEXT: s_max_u32 s2, s2, s4
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_mov_b32 s0, s32
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, s2, 6, v0
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s32, v0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[12:15], s0
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_endpgm
+; GFX8-LABEL: divergent_value:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s90, -1
+; GFX8-NEXT: s_mov_b32 s91, 0xe80000
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_u32 s88, s88, s11
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 15, v0
+; GFX8-NEXT: s_addc_u32 s89, s89, 0
+; GFX8-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX8-NEXT: s_mov_b64 s[0:1], exec
+; GFX8-NEXT: s_mov_b32 s2, 0
+; GFX8-NEXT: s_mov_b32 s33, 0
+; GFX8-NEXT: s_movk_i32 s32, 0x400
+; GFX8-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
+; GFX8-NEXT: v_readlane_b32 s4, v0, s3
+; GFX8-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX8-NEXT: s_max_u32 s2, s2, s4
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: v_lshlrev_b32_e64 v0, 6, s2
+; GFX8-NEXT: s_mov_b32 s0, s32
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s32, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX8-NEXT: buffer_store_dword v0, off, s[88:91], s0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: divergent_value:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s14, -1
+; GFX9-NEXT: s_mov_b32 s15, 0xe00000
+; GFX9-NEXT: s_add_u32 s12, s12, s11
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: s_mov_b32 s2, 0
+; GFX9-NEXT: s_mov_b32 s33, 0
+; GFX9-NEXT: s_movk_i32 s32, 0x400
+; GFX9-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
+; GFX9-NEXT: v_readlane_b32 s4, v0, s3
+; GFX9-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX9-NEXT: s_max_u32 s2, s2, s4
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX9-NEXT: ; %bb.2:
+; GFX9-NEXT: s_mov_b32 s0, s32
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_lshl_add_u32 v0, s2, 6, v0
+; GFX9-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT: buffer_store_dword v0, off, s[12:15], s0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: divergent_value:
+; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s14, -1
+; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000
+; GFX1064-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX1064-NEXT: s_add_u32 s12, s12, s11
+; GFX1064-NEXT: s_addc_u32 s13, s13, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064-NEXT: s_mov_b32 s2, 0
+; GFX1064-NEXT: s_mov_b32 s33, 0
+; GFX1064-NEXT: s_movk_i32 s32, 0x400
+; GFX1064-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s3, s[0:1]
+; GFX1064-NEXT: v_readlane_b32 s4, v0, s3
+; GFX1064-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1064-NEXT: s_max_u32 s2, s2, s4
+; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX1064-NEXT: ; %bb.2:
+; GFX1064-NEXT: s_mov_b32 s0, s32
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX1064-NEXT: v_lshl_add_u32 v0, s2, 6, s0
+; GFX1064-NEXT: buffer_store_dword v1, off, s[12:15], s0
+; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-NEXT: v_readfirstlane_b32 s32, v0
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: divergent_value:
+; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
+; GFX1032-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX1032-NEXT: s_add_u32 s12, s12, s11
+; GFX1032-NEXT: s_addc_u32 s13, s13, 0
+; GFX1032-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032-NEXT: s_mov_b32 s0, 0
+; GFX1032-NEXT: s_mov_b32 s33, 0
+; GFX1032-NEXT: s_movk_i32 s32, 0x200
+; GFX1032-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s2, s1
+; GFX1032-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1032-NEXT: s_bitset0_b32 s1, s2
+; GFX1032-NEXT: s_max_u32 s0, s0, s3
+; GFX1032-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX1032-NEXT: ; %bb.2:
+; GFX1032-NEXT: s_mov_b32 s1, s32
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX1032-NEXT: v_lshl_add_u32 v0, s0, 5, s1
+; GFX1032-NEXT: buffer_store_dword v1, off, s[12:15], s1
+; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-NEXT: v_readfirstlane_b32 s32, v0
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: divergent_value:
+; GFX1164: ; %bb.0: ; %entry
+; GFX1164-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_mov_b32 s2, 0
+; GFX1164-NEXT: s_mov_b32 s33, 0
+; GFX1164-NEXT: s_mov_b32 s32, 16
+; GFX1164-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX1164-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s3
+; GFX1164-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_max_u32 s2, s2, s4
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX1164-NEXT: ; %bb.2:
+; GFX1164-NEXT: s_mov_b32 s0, s32
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX1164-NEXT: v_lshl_add_u32 v0, s2, 6, s0
+; GFX1164-NEXT: scratch_store_b32 off, v1, s0 dlc
+; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1164-NEXT: v_readfirstlane_b32 s32, v0
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: divergent_value:
+; GFX1132: ; %bb.0: ; %entry
+; GFX1132-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, 0
+; GFX1132-NEXT: s_mov_b32 s33, 0
+; GFX1132-NEXT: s_mov_b32 s32, 16
+; GFX1132-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX1132-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_ctz_i32_b32 s2, s1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1132-NEXT: s_bitset0_b32 s1, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: s_max_u32 s0, s0, s3
+; GFX1132-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX1132-NEXT: ; %bb.2:
+; GFX1132-NEXT: s_mov_b32 s1, s32
+; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX1132-NEXT: v_lshl_add_u32 v0, s0, 5, s1
+; GFX1132-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1132-NEXT: v_readfirstlane_b32 s32, v0
+; GFX1132-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %dyn_alloca = alloca i32, i32 %idx, addrspace(5)
+ store volatile i32 123, ptr addrspace(5) %dyn_alloca
+ ret void
+}
+
+define void @custom_alignment(i32 %n) {
+; GFX8DAGISEL-LABEL: custom_alignment:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX8DAGISEL-NEXT: s_mov_b32 s9, s33
+; GFX8DAGISEL-NEXT: s_add_i32 s33, s32, 0xffc0
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: s_and_b32 s33, s33, 0xffff0000
+; GFX8DAGISEL-NEXT: s_add_i32 s32, s32, 0x20000
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT: s_max_u32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, s32
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, s6, 6, v0
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s5, v0
+; GFX8DAGISEL-NEXT: s_and_b32 s32, s5, 0xffff0000
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_add_i32 s32, s32, 0xfffe0000
+; GFX8DAGISEL-NEXT: s_mov_b32 s33, s9
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX8-LABEL: custom_alignment:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 15, v0
+; GFX8-NEXT: s_mov_b32 s9, s33
+; GFX8-NEXT: s_add_i32 s33, s32, 0xffc0
+; GFX8-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: s_mov_b32 s6, 0
+; GFX8-NEXT: s_and_b32 s33, s33, 0xffff0000
+; GFX8-NEXT: s_add_i32 s32, s32, 0x20000
+; GFX8-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8-NEXT: s_max_u32 s6, s6, s8
+; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: v_lshlrev_b32_e64 v0, 6, s6
+; GFX8-NEXT: s_mov_b32 s4, s32
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s5, v0
+; GFX8-NEXT: s_and_b32 s32, s5, 0xffff0000
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_add_i32 s32, s32, 0xfffe0000
+; GFX8-NEXT: s_mov_b32 s33, s9
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: custom_alignment:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-NEXT: s_mov_b32 s9, s33
+; GFX9-NEXT: s_add_i32 s33, s32, 0xffc0
+; GFX9-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: s_mov_b32 s6, 0
+; GFX9-NEXT: s_and_b32 s33, s33, 0xffff0000
+; GFX9-NEXT: s_add_i32 s32, s32, 0x20000
+; GFX9-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-NEXT: s_max_u32 s6, s6, s8
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9-NEXT: ; %bb.2:
+; GFX9-NEXT: s_mov_b32 s4, s32
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-NEXT: v_readfirstlane_b32 s5, v0
+; GFX9-NEXT: s_and_b32 s32, s5, 0xffff0000
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_add_i32 s32, s32, 0xfffe0000
+; GFX9-NEXT: s_mov_b32 s33, s9
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064-LABEL: custom_alignment:
+; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX1064-NEXT: s_mov_b32 s9, s33
+; GFX1064-NEXT: s_add_i32 s33, s32, 0xffc0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: s_mov_b32 s6, 0
+; GFX1064-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX1064-NEXT: s_and_b32 s33, s33, 0xffff0000
+; GFX1064-NEXT: s_add_i32 s32, s32, 0x20000
+; GFX1064-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064-NEXT: s_max_u32 s6, s6, s8
+; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064-NEXT: ; %bb.2:
+; GFX1064-NEXT: s_mov_b32 s4, s32
+; GFX1064-NEXT: s_mov_b32 s33, s9
+; GFX1064-NEXT: v_lshl_add_u32 v0, s6, 6, s4
+; GFX1064-NEXT: v_readfirstlane_b32 s5, v0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1064-NEXT: s_and_b32 s32, s5, 0xffff0000
+; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-NEXT: s_add_i32 s32, s32, 0xfffe0000
+; GFX1064-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032-LABEL: custom_alignment:
+; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX1032-NEXT: s_mov_b32 s8, s33
+; GFX1032-NEXT: s_add_i32 s33, s32, 0x7fe0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX1032-NEXT: s_and_b32 s33, s33, 0xffff8000
+; GFX1032-NEXT: s_add_i32 s32, s32, 0x10000
+; GFX1032-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s6, s5
+; GFX1032-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1032-NEXT: s_bitset0_b32 s5, s6
+; GFX1032-NEXT: s_max_u32 s4, s4, s7
+; GFX1032-NEXT: s_cmp_lg_u32 s5, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032-NEXT: ; %bb.2:
+; GFX1032-NEXT: s_mov_b32 s5, s32
+; GFX1032-NEXT: s_mov_b32 s33, s8
+; GFX1032-NEXT: v_lshl_add_u32 v0, s4, 5, s5
+; GFX1032-NEXT: v_readfirstlane_b32 s4, v0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1032-NEXT: s_and_b32 s32, s4, 0xffff8000
+; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], s5
+; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-NEXT: s_add_i32 s32, s32, 0xffff0000
+; GFX1032-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164-LABEL: custom_alignment:
+; GFX1164: ; %bb.0: ; %entry
+; GFX1164-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX1164-NEXT: s_mov_b32 s5, s33
+; GFX1164-NEXT: s_add_i32 s33, s32, 0x3ff
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_mov_b32 s2, 0
+; GFX1164-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX1164-NEXT: s_and_b32 s33, s33, 0xfffffc00
+; GFX1164-NEXT: s_addk_i32 s32, 0x800
+; GFX1164-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s3
+; GFX1164-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_max_u32 s2, s2, s4
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164-NEXT: ; %bb.2:
+; GFX1164-NEXT: s_mov_b32 s0, s32
+; GFX1164-NEXT: s_mov_b32 s33, s5
+; GFX1164-NEXT: v_lshl_add_u32 v0, s2, 6, s0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1164-NEXT: s_and_b32 s32, s1, 0xffff0000
+; GFX1164-NEXT: scratch_store_b32 off, v0, s0 dlc
+; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1164-NEXT: s_addk_i32 s32, 0xf800
+; GFX1164-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132-LABEL: custom_alignment:
+; GFX1132: ; %bb.0: ; %entry
+; GFX1132-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX1132-NEXT: s_mov_b32 s4, s33
+; GFX1132-NEXT: s_add_i32 s33, s32, 0x3ff
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, 0
+; GFX1132-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX1132-NEXT: s_and_b32 s33, s33, 0xfffffc00
+; GFX1132-NEXT: s_addk_i32 s32, 0x800
+; GFX1132-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_ctz_i32_b32 s2, s1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1132-NEXT: s_bitset0_b32 s1, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: s_max_u32 s0, s0, s3
+; GFX1132-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132-NEXT: ; %bb.2:
+; GFX1132-NEXT: s_mov_b32 s1, s32
+; GFX1132-NEXT: s_mov_b32 s33, s4
+; GFX1132-NEXT: v_lshl_add_u32 v0, s0, 5, s1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1132-NEXT: s_and_b32 s32, s0, 0xffff8000
+; GFX1132-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1132-NEXT: s_addk_i32 s32, 0xf800
+; GFX1132-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %dyn_alloca = alloca i32, i32 %n, align 1024, addrspace(5)
+ store volatile i32 123, ptr addrspace(5) %dyn_alloca
+ ret void
+}
+
+; pointer_offset: allocates %n elements of <4 x i32> with a dynamically sized
+; alloca in addrspace(5), then performs volatile stores of %a to the base
+; pointer and of %b to the address one <4 x i32> element past the base.
+; In every check block below, a single loop (.LBB4_1) takes a copy of exec,
+; reads one active lane of the scaled size per iteration (readlane), keeps the
+; running unsigned maximum (s_max_u32) and clears that lane's bit; s32 is only
+; rewritten (v_readfirstlane_b32) after the loop has finished.
+; NOTE(review): the GFX8DAGISEL checks are instruction-for-instruction the same
+; as the GFX9 checks (they use v_lshl_add_u32) and differ from the GFX8 checks
+; (v_lshlrev_b32 + v_add_u32). The RUN lines are not visible in this part of
+; the file -- confirm which run feeds the GFX8DAGISEL prefix.
+define void @pointer_offset(<4 x i32> %a, <4 x i32> %b, i32 %n) {
+; GFX8DAGISEL-LABEL: pointer_offset:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mov_b32 s9, s33
+; GFX8DAGISEL-NEXT: v_lshlrev_b32_e32 v8, 4, v8
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: s_mov_b32 s33, s32
+; GFX8DAGISEL-NEXT: s_addk_i32 s32, 0x400
+; GFX8DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v8, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT: s_max_u32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, s32
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v8, s4
+; GFX8DAGISEL-NEXT: v_lshl_add_u32 v8, s6, 6, v8
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s32, v8
+; GFX8DAGISEL-NEXT: buffer_store_dword v3, off, s[0:3], s4 offset:12
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: buffer_store_dword v2, off, s[0:3], s4 offset:8
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s4 offset:28
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s4 offset:24
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s4 offset:20
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s4 offset:16
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX8DAGISEL-NEXT: s_mov_b32 s33, s9
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX8-LABEL: pointer_offset:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s9, s33
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 4, v8
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: s_mov_b32 s6, 0
+; GFX8-NEXT: s_mov_b32 s33, s32
+; GFX8-NEXT: s_addk_i32 s32, 0x400
+; GFX8-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8-NEXT: v_readlane_b32 s8, v8, s7
+; GFX8-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8-NEXT: s_max_u32 s6, s6, s8
+; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: v_lshlrev_b32_e64 v8, 6, s6
+; GFX8-NEXT: s_mov_b32 s4, s32
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
+; GFX8-NEXT: v_readfirstlane_b32 s32, v8
+; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s4 offset:12
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s4 offset:8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_store_dword v7, off, s[0:3], s4 offset:28
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_store_dword v6, off, s[0:3], s4 offset:24
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s4 offset:20
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s4 offset:16
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_addk_i32 s32, 0xfc00
+; GFX8-NEXT: s_mov_b32 s33, s9
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: pointer_offset:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s9, s33
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 4, v8
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: s_mov_b32 s6, 0
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-NEXT: v_readlane_b32 s8, v8, s7
+; GFX9-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-NEXT: s_max_u32 s6, s6, s8
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX9-NEXT: ; %bb.2:
+; GFX9-NEXT: s_mov_b32 s4, s32
+; GFX9-NEXT: v_mov_b32_e32 v8, s4
+; GFX9-NEXT: v_lshl_add_u32 v8, s6, 6, v8
+; GFX9-NEXT: v_readfirstlane_b32 s32, v8
+; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s4 offset:12
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s4 offset:8
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s4 offset:28
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s4 offset:24
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s4 offset:20
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s4 offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s9
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064-LABEL: pointer_offset:
+; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064-NEXT: v_lshlrev_b32_e32 v8, 4, v8
+; GFX1064-NEXT: s_mov_b32 s9, s33
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: s_mov_b32 s6, 0
+; GFX1064-NEXT: s_mov_b32 s33, s32
+; GFX1064-NEXT: s_addk_i32 s32, 0x400
+; GFX1064-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064-NEXT: v_readlane_b32 s8, v8, s7
+; GFX1064-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064-NEXT: s_max_u32 s6, s6, s8
+; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX1064-NEXT: ; %bb.2:
+; GFX1064-NEXT: s_mov_b32 s4, s32
+; GFX1064-NEXT: s_mov_b32 s33, s9
+; GFX1064-NEXT: v_lshl_add_u32 v8, s6, 6, s4
+; GFX1064-NEXT: buffer_store_dword v3, off, s[0:3], s4 offset:12
+; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-NEXT: buffer_store_dword v2, off, s[0:3], s4 offset:8
+; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4
+; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-NEXT: buffer_store_dword v7, off, s[0:3], s4 offset:28
+; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-NEXT: buffer_store_dword v6, off, s[0:3], s4 offset:24
+; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-NEXT: buffer_store_dword v5, off, s[0:3], s4 offset:20
+; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-NEXT: buffer_store_dword v4, off, s[0:3], s4 offset:16
+; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-NEXT: v_readfirstlane_b32 s32, v8
+; GFX1064-NEXT: s_addk_i32 s32, 0xfc00
+; GFX1064-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032-LABEL: pointer_offset:
+; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032-NEXT: v_lshlrev_b32_e32 v8, 4, v8
+; GFX1032-NEXT: s_mov_b32 s8, s33
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_mov_b32 s33, s32
+; GFX1032-NEXT: s_addk_i32 s32, 0x200
+; GFX1032-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s6, s5
+; GFX1032-NEXT: v_readlane_b32 s7, v8, s6
+; GFX1032-NEXT: s_bitset0_b32 s5, s6
+; GFX1032-NEXT: s_max_u32 s4, s4, s7
+; GFX1032-NEXT: s_cmp_lg_u32 s5, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX1032-NEXT: ; %bb.2:
+; GFX1032-NEXT: s_mov_b32 s5, s32
+; GFX1032-NEXT: s_mov_b32 s33, s8
+; GFX1032-NEXT: v_lshl_add_u32 v8, s4, 5, s5
+; GFX1032-NEXT: buffer_store_dword v3, off, s[0:3], s5 offset:12
+; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-NEXT: buffer_store_dword v2, off, s[0:3], s5 offset:8
+; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-NEXT: buffer_store_dword v1, off, s[0:3], s5 offset:4
+; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], s5
+; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-NEXT: buffer_store_dword v7, off, s[0:3], s5 offset:28
+; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-NEXT: buffer_store_dword v6, off, s[0:3], s5 offset:24
+; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-NEXT: buffer_store_dword v5, off, s[0:3], s5 offset:20
+; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-NEXT: buffer_store_dword v4, off, s[0:3], s5 offset:16
+; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-NEXT: v_readfirstlane_b32 s32, v8
+; GFX1032-NEXT: s_addk_i32 s32, 0xfe00
+; GFX1032-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164-LABEL: pointer_offset:
+; GFX1164: ; %bb.0: ; %entry
+; GFX1164-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164-NEXT: v_lshlrev_b32_e32 v8, 4, v8
+; GFX1164-NEXT: s_mov_b32 s5, s33
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_mov_b32 s2, 0
+; GFX1164-NEXT: s_mov_b32 s33, s32
+; GFX1164-NEXT: s_add_i32 s32, s32, 16
+; GFX1164-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_readlane_b32 s4, v8, s3
+; GFX1164-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164-NEXT: s_max_u32 s2, s2, s4
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX1164-NEXT: ; %bb.2:
+; GFX1164-NEXT: s_mov_b32 s0, s32
+; GFX1164-NEXT: s_mov_b32 s33, s5
+; GFX1164-NEXT: v_lshl_add_u32 v8, s2, 6, s0
+; GFX1164-NEXT: s_add_i32 s1, s0, 16
+; GFX1164-NEXT: scratch_store_b128 off, v[0:3], s0 dlc
+; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1164-NEXT: scratch_store_b128 off, v[4:7], s1 dlc
+; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1164-NEXT: v_readfirstlane_b32 s32, v8
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_add_i32 s32, s32, -16
+; GFX1164-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132-LABEL: pointer_offset:
+; GFX1132: ; %bb.0: ; %entry
+; GFX1132-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132-NEXT: v_lshlrev_b32_e32 v8, 4, v8
+; GFX1132-NEXT: s_mov_b32 s4, s33
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, 0
+; GFX1132-NEXT: s_mov_b32 s33, s32
+; GFX1132-NEXT: s_add_i32 s32, s32, 16
+; GFX1132-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_ctz_i32_b32 s2, s1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_readlane_b32 s3, v8, s2
+; GFX1132-NEXT: s_bitset0_b32 s1, s2
+; GFX1132-NEXT: s_max_u32 s0, s0, s3
+; GFX1132-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX1132-NEXT: ; %bb.2:
+; GFX1132-NEXT: s_mov_b32 s1, s32
+; GFX1132-NEXT: s_mov_b32 s33, s4
+; GFX1132-NEXT: v_lshl_add_u32 v8, s0, 5, s1
+; GFX1132-NEXT: s_add_i32 s0, s1, 16
+; GFX1132-NEXT: scratch_store_b128 off, v[0:3], s1 dlc
+; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1132-NEXT: scratch_store_b128 off, v[4:7], s0 dlc
+; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1132-NEXT: v_readfirstlane_b32 s32, v8
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: s_add_i32 s32, s32, -16
+; GFX1132-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %dyn_alloca = alloca <4 x i32>, i32 %n, addrspace(5)
+ %ptr = getelementptr <4 x i32>, ptr addrspace(5) %dyn_alloca, i32 1
+ store volatile <4 x i32> %a, ptr addrspace(5) %dyn_alloca
+ store volatile <4 x i32> %b, ptr addrspace(5) %ptr
+ ret void
+}
+
+; multiple_allocas: one constant-sized alloca (4 x i32) followed by two
+; dynamically sized allocas of %m and %n i32 elements, all in addrspace(5).
+; Each of the three resulting pointers receives a volatile store of 123.
+; Every check block below contains two separate exec-mask max-reduction loops
+; (.LBB5_1 and .LBB5_3) -- presumably one per dynamic alloca -- and each loop
+; is followed by its own v_readfirstlane_b32 write to s32.
+; NOTE(review): the GFX8DAGISEL checks are instruction-for-instruction the same
+; as the GFX9 checks (they use v_lshl_add_u32) and differ from the GFX8 checks
+; (v_lshlrev_b32 + v_add_u32). The RUN lines are not visible in this part of
+; the file -- confirm which run feeds the GFX8DAGISEL prefix.
+define void @multiple_allocas(i32 %m, i32 %n) {
+; GFX8DAGISEL-LABEL: multiple_allocas:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX8DAGISEL-NEXT: s_mov_b32 s10, s33
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s7, 0
+; GFX8DAGISEL-NEXT: s_mov_b32 s33, s32
+; GFX8DAGISEL-NEXT: s_addk_i32 s32, 0x800
+; GFX8DAGISEL-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s6
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s6
+; GFX8DAGISEL-NEXT: s_max_u32 s7, s7, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, s32
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, s7, 6, v0
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s32, v0
+; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, v1, 2, 15
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s7, 0
+; GFX8DAGISEL-NEXT: .LBB5_3: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s9, v0, s8
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s8
+; GFX8DAGISEL-NEXT: s_max_u32 s7, s7, s9
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_3
+; GFX8DAGISEL-NEXT: ; %bb.4:
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, s32
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, s7, 6, v0
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s32, v0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[0:3], s33
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[0:3], s6
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_addk_i32 s32, 0xf800
+; GFX8DAGISEL-NEXT: s_mov_b32 s33, s10
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX8-LABEL: multiple_allocas:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 15, v0
+; GFX8-NEXT: s_mov_b32 s10, s33
+; GFX8-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: s_mov_b32 s6, 0
+; GFX8-NEXT: s_mov_b32 s33, s32
+; GFX8-NEXT: s_addk_i32 s32, 0x800
+; GFX8-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8-NEXT: s_max_u32 s6, s6, s8
+; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: v_lshlrev_b32_e64 v0, 6, s6
+; GFX8-NEXT: s_mov_b32 s6, s32
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; GFX8-NEXT: v_readfirstlane_b32 s32, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 15, v0
+; GFX8-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: s_mov_b32 s7, 0
+; GFX8-NEXT: .LBB5_3: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX8-NEXT: v_readlane_b32 s9, v0, s8
+; GFX8-NEXT: s_bitset0_b64 s[4:5], s8
+; GFX8-NEXT: s_max_u32 s7, s7, s9
+; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8-NEXT: s_cbranch_scc1 .LBB5_3
+; GFX8-NEXT: ; %bb.4:
+; GFX8-NEXT: v_lshlrev_b32_e64 v0, 6, s7
+; GFX8-NEXT: s_mov_b32 s4, s32
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s32, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s33
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s6
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_addk_i32 s32, 0xf800
+; GFX8-NEXT: s_mov_b32 s33, s10
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: multiple_allocas:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-NEXT: s_mov_b32 s10, s33
+; GFX9-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: s_mov_b32 s7, 0
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_addk_i32 s32, 0x800
+; GFX9-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX9-NEXT: v_readlane_b32 s8, v0, s6
+; GFX9-NEXT: s_bitset0_b64 s[4:5], s6
+; GFX9-NEXT: s_max_u32 s7, s7, s8
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX9-NEXT: ; %bb.2:
+; GFX9-NEXT: s_mov_b32 s6, s32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_lshl_add_u32 v0, s7, 6, v0
+; GFX9-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-NEXT: v_lshl_add_u32 v0, v1, 2, 15
+; GFX9-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: s_mov_b32 s7, 0
+; GFX9-NEXT: .LBB5_3: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX9-NEXT: v_readlane_b32 s9, v0, s8
+; GFX9-NEXT: s_bitset0_b64 s[4:5], s8
+; GFX9-NEXT: s_max_u32 s7, s7, s9
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB5_3
+; GFX9-NEXT: ; %bb.4:
+; GFX9-NEXT: s_mov_b32 s4, s32
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_lshl_add_u32 v0, s7, 6, v0
+; GFX9-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_addk_i32 s32, 0xf800
+; GFX9-NEXT: s_mov_b32 s33, s10
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064-LABEL: multiple_allocas:
+; GFX1064: ; %bb.0: ; %entry
+; GFX1064-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX1064-NEXT: s_mov_b32 s10, s33
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: s_mov_b32 s7, 0
+; GFX1064-NEXT: s_mov_b32 s33, s32
+; GFX1064-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX1064-NEXT: s_addk_i32 s32, 0x800
+; GFX1064-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX1064-NEXT: v_readlane_b32 s8, v0, s6
+; GFX1064-NEXT: s_bitset0_b64 s[4:5], s6
+; GFX1064-NEXT: s_max_u32 s7, s7, s8
+; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1064-NEXT: ; %bb.2:
+; GFX1064-NEXT: v_lshl_add_u32 v0, v1, 2, 15
+; GFX1064-NEXT: s_mov_b32 s6, s32
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_lshl_add_u32 v1, s7, 6, s6
+; GFX1064-NEXT: s_mov_b32 s7, 0
+; GFX1064-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s32, v1
+; GFX1064-NEXT: .LBB5_3: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX1064-NEXT: v_readlane_b32 s9, v0, s8
+; GFX1064-NEXT: s_bitset0_b64 s[4:5], s8
+; GFX1064-NEXT: s_max_u32 s7, s7, s9
+; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB5_3
+; GFX1064-NEXT: ; %bb.4:
+; GFX1064-NEXT: s_mov_b32 s4, s32
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX1064-NEXT: v_lshl_add_u32 v0, s7, 6, s4
+; GFX1064-NEXT: buffer_store_dword v1, off, s[0:3], s33
+; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-NEXT: buffer_store_dword v1, off, s[0:3], s6
+; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-NEXT: buffer_store_dword v1, off, s[0:3], s4
+; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-NEXT: v_readfirstlane_b32 s32, v0
+; GFX1064-NEXT: s_mov_b32 s33, s10
+; GFX1064-NEXT: s_addk_i32 s32, 0xf800
+; GFX1064-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032-LABEL: multiple_allocas:
+; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX1032-NEXT: s_mov_b32 s9, s33
+; GFX1032-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032-NEXT: s_mov_b32 s5, 0
+; GFX1032-NEXT: s_mov_b32 s33, s32
+; GFX1032-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX1032-NEXT: s_addk_i32 s32, 0x400
+; GFX1032-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s6, s4
+; GFX1032-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1032-NEXT: s_bitset0_b32 s4, s6
+; GFX1032-NEXT: s_max_u32 s5, s5, s7
+; GFX1032-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1032-NEXT: ; %bb.2:
+; GFX1032-NEXT: v_lshl_add_u32 v0, v1, 2, 15
+; GFX1032-NEXT: s_mov_b32 s4, s32
+; GFX1032-NEXT: s_mov_b32 s6, exec_lo
+; GFX1032-NEXT: v_lshl_add_u32 v1, s5, 5, s4
+; GFX1032-NEXT: s_mov_b32 s5, 0
+; GFX1032-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s32, v1
+; GFX1032-NEXT: .LBB5_3: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s7, s6
+; GFX1032-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1032-NEXT: s_bitset0_b32 s6, s7
+; GFX1032-NEXT: s_max_u32 s5, s5, s8
+; GFX1032-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB5_3
+; GFX1032-NEXT: ; %bb.4:
+; GFX1032-NEXT: s_mov_b32 s6, s32
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX1032-NEXT: v_lshl_add_u32 v0, s5, 5, s6
+; GFX1032-NEXT: buffer_store_dword v1, off, s[0:3], s33
+; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-NEXT: buffer_store_dword v1, off, s[0:3], s4
+; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-NEXT: buffer_store_dword v1, off, s[0:3], s6
+; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-NEXT: v_readfirstlane_b32 s32, v0
+; GFX1032-NEXT: s_mov_b32 s33, s9
+; GFX1032-NEXT: s_addk_i32 s32, 0xfc00
+; GFX1032-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164-LABEL: multiple_allocas:
+; GFX1164: ; %bb.0: ; %entry
+; GFX1164-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX1164-NEXT: s_mov_b32 s6, s33
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_mov_b32 s3, 0
+; GFX1164-NEXT: s_mov_b32 s33, s32
+; GFX1164-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX1164-NEXT: s_add_i32 s32, s32, 32
+; GFX1164-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: s_bitset0_b64 s[0:1], s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_max_u32 s3, s3, s4
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1164-NEXT: ; %bb.2:
+; GFX1164-NEXT: v_lshl_add_u32 v0, v1, 2, 15
+; GFX1164-NEXT: s_mov_b32 s2, s32
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_lshl_add_u32 v1, s3, 6, s2
+; GFX1164-NEXT: s_mov_b32 s3, 0
+; GFX1164-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: v_readfirstlane_b32 s32, v1
+; GFX1164-NEXT: .LBB5_3: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1164-NEXT: s_bitset0_b64 s[0:1], s4
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_max_u32 s3, s3, s5
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB5_3
+; GFX1164-NEXT: ; %bb.4:
+; GFX1164-NEXT: s_mov_b32 s0, s32
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX1164-NEXT: v_lshl_add_u32 v0, s3, 6, s0
+; GFX1164-NEXT: scratch_store_b32 off, v1, s33 dlc
+; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1164-NEXT: scratch_store_b32 off, v1, s2 dlc
+; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1164-NEXT: scratch_store_b32 off, v1, s0 dlc
+; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1164-NEXT: v_readfirstlane_b32 s32, v0
+; GFX1164-NEXT: s_mov_b32 s33, s6
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_addk_i32 s32, 0xffe0
+; GFX1164-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132-LABEL: multiple_allocas:
+; GFX1132: ; %bb.0: ; %entry
+; GFX1132-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX1132-NEXT: s_mov_b32 s5, s33
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: s_mov_b32 s1, 0
+; GFX1132-NEXT: s_mov_b32 s33, s32
+; GFX1132-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX1132-NEXT: s_add_i32 s32, s32, 32
+; GFX1132-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1132-NEXT: s_bitset0_b32 s0, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: s_max_u32 s1, s1, s3
+; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1132-NEXT: ; %bb.2:
+; GFX1132-NEXT: v_lshl_add_u32 v0, v1, 2, 15
+; GFX1132-NEXT: s_mov_b32 s0, s32
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: v_lshl_add_u32 v1, s1, 5, s0
+; GFX1132-NEXT: s_mov_b32 s1, 0
+; GFX1132-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: v_readfirstlane_b32 s32, v1
+; GFX1132-NEXT: .LBB5_3: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_readlane_b32 s4, v0, s3
+; GFX1132-NEXT: s_bitset0_b32 s2, s3
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: s_max_u32 s1, s1, s4
+; GFX1132-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB5_3
+; GFX1132-NEXT: ; %bb.4:
+; GFX1132-NEXT: s_mov_b32 s2, s32
+; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX1132-NEXT: v_lshl_add_u32 v0, s1, 5, s2
+; GFX1132-NEXT: scratch_store_b32 off, v1, s33 dlc
+; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1132-NEXT: scratch_store_b32 off, v1, s0 dlc
+; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1132-NEXT: scratch_store_b32 off, v1, s2 dlc
+; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1132-NEXT: v_readfirstlane_b32 s32, v0
+; GFX1132-NEXT: s_mov_b32 s33, s5
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: s_addk_i32 s32, 0xffe0
+; GFX1132-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %static_alloca = alloca i32, i32 4, addrspace(5)
+ %dyn_alloca_1 = alloca i32, i32 %m, addrspace(5)
+ %dyn_alloca_2 = alloca i32, i32 %n, addrspace(5)
+ store volatile i32 123, ptr addrspace(5) %static_alloca
+ store volatile i32 123, ptr addrspace(5) %dyn_alloca_1
+ store volatile i32 123, ptr addrspace(5) %dyn_alloca_2
+ ret void
+}
+
+define void @callee(<33 x i32> %a){
+; GFX8DAGISEL-LABEL: callee:
+; GFX8DAGISEL: ; %bb.0:
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mov_b32 s9, s33
+; GFX8DAGISEL-NEXT: s_mov_b32 s33, s32
+; GFX8DAGISEL-NEXT: buffer_load_dword v0, off, s[0:3], s33
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: s_addk_i32 s32, 0x400
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX8DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT: s_max_u32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, s32
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX8DAGISEL-NEXT: v_lshl_add_u32 v0, s6, 6, v0
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s32, v0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, 0x3039
+; GFX8DAGISEL-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX8DAGISEL-NEXT: s_mov_b32 s33, s9
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX8-LABEL: callee:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s9, s33
+; GFX8-NEXT: s_mov_b32 s33, s32
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s33
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: s_mov_b32 s6, 0
+; GFX8-NEXT: s_addk_i32 s32, 0x400
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 15, v0
+; GFX8-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX8-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8-NEXT: s_max_u32 s6, s6, s8
+; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: v_lshlrev_b32_e64 v0, 6, s6
+; GFX8-NEXT: s_mov_b32 s4, s32
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s32, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x3039
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_addk_i32 s32, 0xfc00
+; GFX8-NEXT: s_mov_b32 s33, s9
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: callee:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s9, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: s_mov_b32 s6, 0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-NEXT: s_max_u32 s6, s6, s8
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX9-NEXT: ; %bb.2:
+; GFX9-NEXT: s_mov_b32 s4, s32
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x3039
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s9
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064-LABEL: callee:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064-NEXT: s_mov_b32 s9, s33
+; GFX1064-NEXT: s_mov_b32 s33, s32
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: buffer_load_dword v0, off, s[0:3], s33
+; GFX1064-NEXT: s_mov_b32 s6, 0
+; GFX1064-NEXT: s_addk_i32 s32, 0x400
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX1064-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX1064-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064-NEXT: s_max_u32 s6, s6, s8
+; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1064-NEXT: ; %bb.2:
+; GFX1064-NEXT: s_mov_b32 s4, s32
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX1064-NEXT: v_lshl_add_u32 v0, s6, 6, s4
+; GFX1064-NEXT: s_mov_b32 s33, s9
+; GFX1064-NEXT: buffer_store_dword v1, off, s[0:3], s4
+; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-NEXT: v_readfirstlane_b32 s32, v0
+; GFX1064-NEXT: s_addk_i32 s32, 0xfc00
+; GFX1064-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032-LABEL: callee:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032-NEXT: s_mov_b32 s8, s33
+; GFX1032-NEXT: s_mov_b32 s33, s32
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: buffer_load_dword v0, off, s[0:3], s33
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_addk_i32 s32, 0x200
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX1032-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX1032-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s6, s5
+; GFX1032-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1032-NEXT: s_bitset0_b32 s5, s6
+; GFX1032-NEXT: s_max_u32 s4, s4, s7
+; GFX1032-NEXT: s_cmp_lg_u32 s5, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1032-NEXT: ; %bb.2:
+; GFX1032-NEXT: s_mov_b32 s5, s32
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX1032-NEXT: v_lshl_add_u32 v0, s4, 5, s5
+; GFX1032-NEXT: s_mov_b32 s33, s8
+; GFX1032-NEXT: buffer_store_dword v1, off, s[0:3], s5
+; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-NEXT: v_readfirstlane_b32 s32, v0
+; GFX1032-NEXT: s_addk_i32 s32, 0xfe00
+; GFX1032-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164-LABEL: callee:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164-NEXT: s_mov_b32 s5, s33
+; GFX1164-NEXT: s_mov_b32 s33, s32
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: scratch_load_b32 v0, off, s33
+; GFX1164-NEXT: s_mov_b32 s2, 0
+; GFX1164-NEXT: s_add_i32 s32, s32, 16
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX1164-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s4, v0, s3
+; GFX1164-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_max_u32 s2, s2, s4
+; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1164-NEXT: ; %bb.2:
+; GFX1164-NEXT: s_mov_b32 s0, s32
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX1164-NEXT: v_lshl_add_u32 v0, s2, 6, s0
+; GFX1164-NEXT: s_mov_b32 s33, s5
+; GFX1164-NEXT: scratch_store_b32 off, v1, s0 dlc
+; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1164-NEXT: v_readfirstlane_b32 s32, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_add_i32 s32, s32, -16
+; GFX1164-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132-LABEL: callee:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132-NEXT: s_mov_b32 s4, s33
+; GFX1132-NEXT: s_mov_b32 s33, s32
+; GFX1132-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132-NEXT: scratch_load_b32 v0, off, s33
+; GFX1132-NEXT: s_mov_b32 s0, 0
+; GFX1132-NEXT: s_add_i32 s32, s32, 16
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX1132-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_ctz_i32_b32 s2, s1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1132-NEXT: s_bitset0_b32 s1, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: s_max_u32 s0, s0, s3
+; GFX1132-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1132-NEXT: ; %bb.2:
+; GFX1132-NEXT: s_mov_b32 s1, s32
+; GFX1132-NEXT: v_mov_b32_e32 v1, 0x3039
+; GFX1132-NEXT: v_lshl_add_u32 v0, s0, 5, s1
+; GFX1132-NEXT: s_mov_b32 s33, s4
+; GFX1132-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1132-NEXT: v_readfirstlane_b32 s32, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: s_add_i32 s32, s32, -16
+; GFX1132-NEXT: s_setpc_b64 s[30:31]
+ %val = extractelement <33 x i32> %a, i32 31
+ %dyn_alloca_callee = alloca i32, i32 %val, addrspace(5)
+ store volatile i32 12345, ptr addrspace(5) %dyn_alloca_callee
+ ret void
+}
+
+define amdgpu_kernel void @caller(<33 x i32> %a) {
+; GFX8DAGISEL-LABEL: caller:
+; GFX8DAGISEL: ; %bb.0:
+; GFX8DAGISEL-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0
+; GFX8DAGISEL-NEXT: s_mov_b32 s12, s8
+; GFX8DAGISEL-NEXT: s_load_dwordx16 s[16:31], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x64
+; GFX8DAGISEL-NEXT: s_load_dword s8, s[4:5], 0xa4
+; GFX8DAGISEL-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
+; GFX8DAGISEL-NEXT: s_mov_b32 s54, -1
+; GFX8DAGISEL-NEXT: s_mov_b32 s55, 0xe00000
+; GFX8DAGISEL-NEXT: s_add_u32 s52, s52, s11
+; GFX8DAGISEL-NEXT: v_lshl_add_u32 v3, v0, 3, 15
+; GFX8DAGISEL-NEXT: s_addc_u32 s53, s53, 0
+; GFX8DAGISEL-NEXT: s_mov_b32 s14, s10
+; GFX8DAGISEL-NEXT: s_mov_b32 s13, s9
+; GFX8DAGISEL-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v3, 0x3ff0, v3
+; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s9, 0
+; GFX8DAGISEL-NEXT: s_mov_b32 s33, 0
+; GFX8DAGISEL-NEXT: s_movk_i32 s32, 0x400
+; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s15, s[6:7]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s34, v3, s15
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s15
+; GFX8DAGISEL-NEXT: s_max_u32 s9, s9, s34
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, s32
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s6
+; GFX8DAGISEL-NEXT: v_lshl_add_u32 v3, s9, 6, v3
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s32, v3
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, 0x7b
+; GFX8DAGISEL-NEXT: buffer_store_dword v3, off, s[52:55], s6
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s51
+; GFX8DAGISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX8DAGISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX8DAGISEL-NEXT: buffer_store_dword v3, off, s[52:55], s32
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX8DAGISEL-NEXT: s_add_u32 s8, s4, 0x124
+; GFX8DAGISEL-NEXT: s_addc_u32 s9, s5, 0
+; GFX8DAGISEL-NEXT: s_getpc_b64 s[4:5]
+; GFX8DAGISEL-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4
+; GFX8DAGISEL-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GFX8DAGISEL-NEXT: buffer_store_dword v3, off, s[52:55], s32 offset:4
+; GFX8DAGISEL-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], s[54:55]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s16
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s17
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s18
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s19
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s20
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v5, s21
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, s22
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, s23
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v8, s24
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v9, s25
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v10, s26
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v11, s27
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v12, s28
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v13, s29
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v14, s30
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v15, s31
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v16, s36
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v17, s37
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v18, s38
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v19, s39
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v20, s40
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v21, s41
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v22, s42
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v23, s43
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v24, s44
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v25, s45
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v26, s46
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v27, s47
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v28, s48
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v29, s49
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v30, s50
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX8DAGISEL-NEXT: s_endpgm
+; GFX8-LABEL: caller:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s12, s8
+; GFX8-NEXT: s_load_dwordx16 s[16:31], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x64
+; GFX8-NEXT: s_load_dword s8, s[4:5], 0xa4
+; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s90, -1
+; GFX8-NEXT: s_mov_b32 s91, 0xe80000
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX8-NEXT: s_add_u32 s88, s88, s11
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 15, v3
+; GFX8-NEXT: s_addc_u32 s89, s89, 0
+; GFX8-NEXT: s_mov_b32 s14, s10
+; GFX8-NEXT: s_mov_b32 s13, s9
+; GFX8-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX8-NEXT: v_and_b32_e32 v3, 0x3ff0, v3
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: s_mov_b32 s9, 0
+; GFX8-NEXT: s_mov_b32 s33, 0
+; GFX8-NEXT: s_movk_i32 s32, 0x400
+; GFX8-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_ff1_i32_b64 s15, s[6:7]
+; GFX8-NEXT: v_readlane_b32 s34, v3, s15
+; GFX8-NEXT: s_bitset0_b64 s[6:7], s15
+; GFX8-NEXT: s_max_u32 s9, s9, s34
+; GFX8-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX8-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: v_lshlrev_b32_e64 v3, 6, s9
+; GFX8-NEXT: s_mov_b32 s6, s32
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s6, v3
+; GFX8-NEXT: v_readfirstlane_b32 s32, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, 0x7b
+; GFX8-NEXT: buffer_store_dword v3, off, s[88:91], s6
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v3, s51
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX8-NEXT: buffer_store_dword v3, off, s[88:91], s32
+; GFX8-NEXT: v_mov_b32_e32 v3, s8
+; GFX8-NEXT: s_add_u32 s8, s4, 0x124
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
+; GFX8-NEXT: s_getpc_b64 s[4:5]
+; GFX8-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4
+; GFX8-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12
+; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX8-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX8-NEXT: s_mov_b64 s[0:1], s[88:89]
+; GFX8-NEXT: buffer_store_dword v3, off, s[88:91], s32 offset:4
+; GFX8-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX8-NEXT: s_mov_b64 s[2:3], s[90:91]
+; GFX8-NEXT: v_mov_b32_e32 v0, s16
+; GFX8-NEXT: v_mov_b32_e32 v1, s17
+; GFX8-NEXT: v_mov_b32_e32 v2, s18
+; GFX8-NEXT: v_mov_b32_e32 v3, s19
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
+; GFX8-NEXT: v_mov_b32_e32 v5, s21
+; GFX8-NEXT: v_mov_b32_e32 v6, s22
+; GFX8-NEXT: v_mov_b32_e32 v7, s23
+; GFX8-NEXT: v_mov_b32_e32 v8, s24
+; GFX8-NEXT: v_mov_b32_e32 v9, s25
+; GFX8-NEXT: v_mov_b32_e32 v10, s26
+; GFX8-NEXT: v_mov_b32_e32 v11, s27
+; GFX8-NEXT: v_mov_b32_e32 v12, s28
+; GFX8-NEXT: v_mov_b32_e32 v13, s29
+; GFX8-NEXT: v_mov_b32_e32 v14, s30
+; GFX8-NEXT: v_mov_b32_e32 v15, s31
+; GFX8-NEXT: v_mov_b32_e32 v16, s36
+; GFX8-NEXT: v_mov_b32_e32 v17, s37
+; GFX8-NEXT: v_mov_b32_e32 v18, s38
+; GFX8-NEXT: v_mov_b32_e32 v19, s39
+; GFX8-NEXT: v_mov_b32_e32 v20, s40
+; GFX8-NEXT: v_mov_b32_e32 v21, s41
+; GFX8-NEXT: v_mov_b32_e32 v22, s42
+; GFX8-NEXT: v_mov_b32_e32 v23, s43
+; GFX8-NEXT: v_mov_b32_e32 v24, s44
+; GFX8-NEXT: v_mov_b32_e32 v25, s45
+; GFX8-NEXT: v_mov_b32_e32 v26, s46
+; GFX8-NEXT: v_mov_b32_e32 v27, s47
+; GFX8-NEXT: v_mov_b32_e32 v28, s48
+; GFX8-NEXT: v_mov_b32_e32 v29, s49
+; GFX8-NEXT: v_mov_b32_e32 v30, s50
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: caller:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s12, s8
+; GFX9-NEXT: s_load_dwordx16 s[16:31], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x64
+; GFX9-NEXT: s_load_dword s8, s[4:5], 0xa4
+; GFX9-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s54, -1
+; GFX9-NEXT: s_mov_b32 s55, 0xe00000
+; GFX9-NEXT: s_add_u32 s52, s52, s11
+; GFX9-NEXT: v_lshl_add_u32 v3, v0, 3, 15
+; GFX9-NEXT: s_addc_u32 s53, s53, 0
+; GFX9-NEXT: s_mov_b32 s14, s10
+; GFX9-NEXT: s_mov_b32 s13, s9
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: v_and_b32_e32 v3, 0x3ff0, v3
+; GFX9-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-NEXT: s_mov_b32 s9, 0
+; GFX9-NEXT: s_mov_b32 s33, 0
+; GFX9-NEXT: s_movk_i32 s32, 0x400
+; GFX9-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_ff1_i32_b64 s15, s[6:7]
+; GFX9-NEXT: v_readlane_b32 s34, v3, s15
+; GFX9-NEXT: s_bitset0_b64 s[6:7], s15
+; GFX9-NEXT: s_max_u32 s9, s9, s34
+; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX9-NEXT: ; %bb.2:
+; GFX9-NEXT: s_mov_b32 s6, s32
+; GFX9-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-NEXT: v_lshl_add_u32 v3, s9, 6, v3
+; GFX9-NEXT: v_readfirstlane_b32 s32, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7b
+; GFX9-NEXT: buffer_store_dword v3, off, s[52:55], s6
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, s51
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: buffer_store_dword v3, off, s[52:55], s32
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
+; GFX9-NEXT: s_add_u32 s8, s4, 0x124
+; GFX9-NEXT: s_addc_u32 s9, s5, 0
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GFX9-NEXT: buffer_store_dword v3, off, s[52:55], s32 offset:4
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b64 s[2:3], s[54:55]
+; GFX9-NEXT: v_mov_b32_e32 v0, s16
+; GFX9-NEXT: v_mov_b32_e32 v1, s17
+; GFX9-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-NEXT: v_mov_b32_e32 v3, s19
+; GFX9-NEXT: v_mov_b32_e32 v4, s20
+; GFX9-NEXT: v_mov_b32_e32 v5, s21
+; GFX9-NEXT: v_mov_b32_e32 v6, s22
+; GFX9-NEXT: v_mov_b32_e32 v7, s23
+; GFX9-NEXT: v_mov_b32_e32 v8, s24
+; GFX9-NEXT: v_mov_b32_e32 v9, s25
+; GFX9-NEXT: v_mov_b32_e32 v10, s26
+; GFX9-NEXT: v_mov_b32_e32 v11, s27
+; GFX9-NEXT: v_mov_b32_e32 v12, s28
+; GFX9-NEXT: v_mov_b32_e32 v13, s29
+; GFX9-NEXT: v_mov_b32_e32 v14, s30
+; GFX9-NEXT: v_mov_b32_e32 v15, s31
+; GFX9-NEXT: v_mov_b32_e32 v16, s36
+; GFX9-NEXT: v_mov_b32_e32 v17, s37
+; GFX9-NEXT: v_mov_b32_e32 v18, s38
+; GFX9-NEXT: v_mov_b32_e32 v19, s39
+; GFX9-NEXT: v_mov_b32_e32 v20, s40
+; GFX9-NEXT: v_mov_b32_e32 v21, s41
+; GFX9-NEXT: v_mov_b32_e32 v22, s42
+; GFX9-NEXT: v_mov_b32_e32 v23, s43
+; GFX9-NEXT: v_mov_b32_e32 v24, s44
+; GFX9-NEXT: v_mov_b32_e32 v25, s45
+; GFX9-NEXT: v_mov_b32_e32 v26, s46
+; GFX9-NEXT: v_mov_b32_e32 v27, s47
+; GFX9-NEXT: v_mov_b32_e32 v28, s48
+; GFX9-NEXT: v_mov_b32_e32 v29, s49
+; GFX9-NEXT: v_mov_b32_e32 v30, s50
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: caller:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s12, s8
+; GFX1064-NEXT: s_clause 0x2
+; GFX1064-NEXT: s_load_dwordx16 s[16:31], s[4:5], 0x24
+; GFX1064-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x64
+; GFX1064-NEXT: s_load_dword s8, s[4:5], 0xa4
+; GFX1064-NEXT: v_lshl_add_u32 v3, v0, 3, 15
+; GFX1064-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s54, -1
+; GFX1064-NEXT: s_mov_b32 s55, 0x31e16000
+; GFX1064-NEXT: v_and_b32_e32 v3, 0x3ff0, v3
+; GFX1064-NEXT: s_add_u32 s52, s52, s11
+; GFX1064-NEXT: s_addc_u32 s53, s53, 0
+; GFX1064-NEXT: s_mov_b32 s14, s10
+; GFX1064-NEXT: s_mov_b32 s13, s9
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064-NEXT: s_mov_b32 s9, 0
+; GFX1064-NEXT: s_mov_b32 s33, 0
+; GFX1064-NEXT: s_movk_i32 s32, 0x400
+; GFX1064-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_ff1_i32_b64 s15, s[6:7]
+; GFX1064-NEXT: v_readlane_b32 s34, v3, s15
+; GFX1064-NEXT: s_bitset0_b64 s[6:7], s15
+; GFX1064-NEXT: s_max_u32 s9, s9, s34
+; GFX1064-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1064-NEXT: ; %bb.2:
+; GFX1064-NEXT: s_mov_b32 s6, s32
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v6, s8
+; GFX1064-NEXT: s_add_u32 s8, s4, 0x124
+; GFX1064-NEXT: v_lshl_add_u32 v3, s9, 6, s6
+; GFX1064-NEXT: s_addc_u32 s9, s5, 0
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7b
+; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v5, s51
+; GFX1064-NEXT: v_readfirstlane_b32 s32, v3
+; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: buffer_store_dword v4, off, s[52:55], s6
+; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1064-NEXT: buffer_store_dword v5, off, s[52:55], s32
+; GFX1064-NEXT: buffer_store_dword v6, off, s[52:55], s32 offset:4
+; GFX1064-NEXT: v_mov_b32_e32 v3, s19
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: v_mov_b32_e32 v0, s16
+; GFX1064-NEXT: v_mov_b32_e32 v1, s17
+; GFX1064-NEXT: v_mov_b32_e32 v2, s18
+; GFX1064-NEXT: v_mov_b32_e32 v4, s20
+; GFX1064-NEXT: v_mov_b32_e32 v5, s21
+; GFX1064-NEXT: v_mov_b32_e32 v6, s22
+; GFX1064-NEXT: v_mov_b32_e32 v7, s23
+; GFX1064-NEXT: v_mov_b32_e32 v8, s24
+; GFX1064-NEXT: v_mov_b32_e32 v9, s25
+; GFX1064-NEXT: v_mov_b32_e32 v10, s26
+; GFX1064-NEXT: v_mov_b32_e32 v11, s27
+; GFX1064-NEXT: v_mov_b32_e32 v12, s28
+; GFX1064-NEXT: v_mov_b32_e32 v13, s29
+; GFX1064-NEXT: v_mov_b32_e32 v14, s30
+; GFX1064-NEXT: v_mov_b32_e32 v15, s31
+; GFX1064-NEXT: v_mov_b32_e32 v16, s36
+; GFX1064-NEXT: v_mov_b32_e32 v17, s37
+; GFX1064-NEXT: v_mov_b32_e32 v18, s38
+; GFX1064-NEXT: v_mov_b32_e32 v19, s39
+; GFX1064-NEXT: v_mov_b32_e32 v20, s40
+; GFX1064-NEXT: v_mov_b32_e32 v21, s41
+; GFX1064-NEXT: v_mov_b32_e32 v22, s42
+; GFX1064-NEXT: v_mov_b32_e32 v23, s43
+; GFX1064-NEXT: v_mov_b32_e32 v24, s44
+; GFX1064-NEXT: v_mov_b32_e32 v25, s45
+; GFX1064-NEXT: v_mov_b32_e32 v26, s46
+; GFX1064-NEXT: v_mov_b32_e32 v27, s47
+; GFX1064-NEXT: v_mov_b32_e32 v28, s48
+; GFX1064-NEXT: v_mov_b32_e32 v29, s49
+; GFX1064-NEXT: v_mov_b32_e32 v30, s50
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[54:55]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: caller:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s54, -1
+; GFX1032-NEXT: s_mov_b32 s55, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s52, s52, s11
+; GFX1032-NEXT: s_mov_b32 s14, s10
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT: s_clause 0x2
+; GFX1032-NEXT: s_load_dwordx16 s[16:31], s[4:5], 0x24
+; GFX1032-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x64
+; GFX1032-NEXT: s_load_dword s6, s[4:5], 0xa4
+; GFX1032-NEXT: v_lshl_add_u32 v3, v0, 3, 15
+; GFX1032-NEXT: s_addc_u32 s53, s53, 0
+; GFX1032-NEXT: s_mov_b32 s13, s9
+; GFX1032-NEXT: s_mov_b32 s12, s8
+; GFX1032-NEXT: s_mov_b32 s8, exec_lo
+; GFX1032-NEXT: v_and_b32_e32 v3, 0x3ff0, v3
+; GFX1032-NEXT: s_mov_b32 s7, 0
+; GFX1032-NEXT: s_mov_b32 s33, 0
+; GFX1032-NEXT: s_movk_i32 s32, 0x200
+; GFX1032-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_ff1_i32_b32 s9, s8
+; GFX1032-NEXT: v_readlane_b32 s15, v3, s9
+; GFX1032-NEXT: s_bitset0_b32 s8, s9
+; GFX1032-NEXT: s_max_u32 s7, s7, s15
+; GFX1032-NEXT: s_cmp_lg_u32 s8, 0
+; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1032-NEXT: ; %bb.2:
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0x7b
+; GFX1032-NEXT: s_mov_b32 s8, s32
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v5, s51
+; GFX1032-NEXT: v_lshl_add_u32 v3, s7, 5, s8
+; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: buffer_store_dword v4, off, s[52:55], s8
+; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032-NEXT: s_add_u32 s8, s4, 0x124
+; GFX1032-NEXT: s_addc_u32 s9, s5, 0
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12
+; GFX1032-NEXT: v_readfirstlane_b32 s32, v3
+; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: v_mov_b32_e32 v6, s6
+; GFX1032-NEXT: buffer_store_dword v5, off, s[52:55], s32
+; GFX1032-NEXT: buffer_store_dword v6, off, s[52:55], s32 offset:4
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: v_mov_b32_e32 v0, s16
+; GFX1032-NEXT: v_mov_b32_e32 v1, s17
+; GFX1032-NEXT: v_mov_b32_e32 v2, s18
+; GFX1032-NEXT: v_mov_b32_e32 v3, s19
+; GFX1032-NEXT: v_mov_b32_e32 v4, s20
+; GFX1032-NEXT: v_mov_b32_e32 v5, s21
+; GFX1032-NEXT: v_mov_b32_e32 v6, s22
+; GFX1032-NEXT: v_mov_b32_e32 v7, s23
+; GFX1032-NEXT: v_mov_b32_e32 v8, s24
+; GFX1032-NEXT: v_mov_b32_e32 v9, s25
+; GFX1032-NEXT: v_mov_b32_e32 v10, s26
+; GFX1032-NEXT: v_mov_b32_e32 v11, s27
+; GFX1032-NEXT: v_mov_b32_e32 v12, s28
+; GFX1032-NEXT: v_mov_b32_e32 v13, s29
+; GFX1032-NEXT: v_mov_b32_e32 v14, s30
+; GFX1032-NEXT: v_mov_b32_e32 v15, s31
+; GFX1032-NEXT: v_mov_b32_e32 v16, s36
+; GFX1032-NEXT: v_mov_b32_e32 v17, s37
+; GFX1032-NEXT: v_mov_b32_e32 v18, s38
+; GFX1032-NEXT: v_mov_b32_e32 v19, s39
+; GFX1032-NEXT: v_mov_b32_e32 v20, s40
+; GFX1032-NEXT: v_mov_b32_e32 v21, s41
+; GFX1032-NEXT: v_mov_b32_e32 v22, s42
+; GFX1032-NEXT: v_mov_b32_e32 v23, s43
+; GFX1032-NEXT: v_mov_b32_e32 v24, s44
+; GFX1032-NEXT: v_mov_b32_e32 v25, s45
+; GFX1032-NEXT: v_mov_b32_e32 v26, s46
+; GFX1032-NEXT: v_mov_b32_e32 v27, s47
+; GFX1032-NEXT: v_mov_b32_e32 v28, s48
+; GFX1032-NEXT: v_mov_b32_e32 v29, s49
+; GFX1032-NEXT: v_mov_b32_e32 v30, s50
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[54:55]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: caller:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b32 s12, s8
+; GFX1164-NEXT: s_clause 0x2
+; GFX1164-NEXT: s_load_b512 s[16:31], s[4:5], 0x24
+; GFX1164-NEXT: s_load_b512 s[36:51], s[4:5], 0x64
+; GFX1164-NEXT: s_load_b32 s8, s[4:5], 0xa4
+; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1164-NEXT: s_mov_b32 s14, s10
+; GFX1164-NEXT: s_mov_b32 s13, s9
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164-NEXT: v_lshl_add_u32 v1, v1, 3, 15
+; GFX1164-NEXT: s_mov_b32 s9, 0
+; GFX1164-NEXT: s_mov_b32 s33, 0
+; GFX1164-NEXT: s_mov_b32 s32, 16
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_and_b32_e32 v1, 0x3ff0, v1
+; GFX1164-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_ctz_i32_b64 s15, s[6:7]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_readlane_b32 s34, v1, s15
+; GFX1164-NEXT: s_bitset0_b64 s[6:7], s15
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_max_u32 s9, s9, s34
+; GFX1164-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1164-NEXT: ; %bb.2:
+; GFX1164-NEXT: s_mov_b32 s6, s32
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mov_b32_e32 v2, s8
+; GFX1164-NEXT: s_add_u32 s8, s4, 0x124
+; GFX1164-NEXT: v_lshl_add_u32 v3, s9, 6, s6
+; GFX1164-NEXT: s_addc_u32 s9, s5, 0
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0x7b
+; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s51
+; GFX1164-NEXT: v_readfirstlane_b32 s32, v3
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: scratch_store_b32 off, v4, s6 dlc
+; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, s16
+; GFX1164-NEXT: v_mov_b32_e32 v3, s19
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], s32
+; GFX1164-NEXT: v_mov_b32_e32 v1, s17
+; GFX1164-NEXT: v_mov_b32_e32 v2, s18
+; GFX1164-NEXT: v_mov_b32_e32 v4, s20
+; GFX1164-NEXT: v_mov_b32_e32 v5, s21
+; GFX1164-NEXT: v_mov_b32_e32 v6, s22
+; GFX1164-NEXT: v_mov_b32_e32 v7, s23
+; GFX1164-NEXT: v_mov_b32_e32 v8, s24
+; GFX1164-NEXT: v_mov_b32_e32 v9, s25
+; GFX1164-NEXT: v_mov_b32_e32 v10, s26
+; GFX1164-NEXT: v_mov_b32_e32 v11, s27
+; GFX1164-NEXT: v_mov_b32_e32 v12, s28
+; GFX1164-NEXT: v_mov_b32_e32 v13, s29
+; GFX1164-NEXT: v_mov_b32_e32 v14, s30
+; GFX1164-NEXT: v_mov_b32_e32 v15, s31
+; GFX1164-NEXT: v_mov_b32_e32 v16, s36
+; GFX1164-NEXT: v_mov_b32_e32 v17, s37
+; GFX1164-NEXT: v_mov_b32_e32 v18, s38
+; GFX1164-NEXT: v_mov_b32_e32 v19, s39
+; GFX1164-NEXT: v_mov_b32_e32 v20, s40
+; GFX1164-NEXT: v_mov_b32_e32 v21, s41
+; GFX1164-NEXT: v_mov_b32_e32 v22, s42
+; GFX1164-NEXT: v_mov_b32_e32 v23, s43
+; GFX1164-NEXT: v_mov_b32_e32 v24, s44
+; GFX1164-NEXT: v_mov_b32_e32 v25, s45
+; GFX1164-NEXT: v_mov_b32_e32 v26, s46
+; GFX1164-NEXT: v_mov_b32_e32 v27, s47
+; GFX1164-NEXT: v_mov_b32_e32 v28, s48
+; GFX1164-NEXT: v_mov_b32_e32 v29, s49
+; GFX1164-NEXT: v_mov_b32_e32 v30, s50
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: caller:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT: s_clause 0x2
+; GFX1132-NEXT: s_load_b512 s[16:31], s[4:5], 0x24
+; GFX1132-NEXT: s_load_b512 s[36:51], s[4:5], 0x64
+; GFX1132-NEXT: s_load_b32 s6, s[4:5], 0xa4
+; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132-NEXT: s_mov_b32 s7, 0
+; GFX1132-NEXT: s_mov_b32 s33, 0
+; GFX1132-NEXT: v_lshl_add_u32 v1, v1, 3, 15
+; GFX1132-NEXT: s_mov_b32 s32, 16
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_and_b32_e32 v1, 0x3ff0, v1
+; GFX1132-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_ctz_i32_b32 s9, s8
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_readlane_b32 s13, v1, s9
+; GFX1132-NEXT: s_bitset0_b32 s8, s9
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: s_max_u32 s7, s7, s13
+; GFX1132-NEXT: s_cmp_lg_u32 s8, 0
+; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1132-NEXT: ; %bb.2:
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_dual_mov_b32 v4, 0x7b :: v_dual_mov_b32 v1, s51
+; GFX1132-NEXT: s_mov_b32 s8, s32
+; GFX1132-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: v_lshl_add_u32 v3, s7, 5, s8
+; GFX1132-NEXT: scratch_store_b32 off, v4, s8 dlc
+; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1132-NEXT: s_add_u32 s8, s4, 0x124
+; GFX1132-NEXT: s_addc_u32 s9, s5, 0
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12
+; GFX1132-NEXT: v_readfirstlane_b32 s32, v3
+; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v3, s19
+; GFX1132-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], s32
+; GFX1132-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v2, s18
+; GFX1132-NEXT: v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23
+; GFX1132-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v9, s25
+; GFX1132-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v11, s27
+; GFX1132-NEXT: v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v13, s29
+; GFX1132-NEXT: v_dual_mov_b32 v14, s30 :: v_dual_mov_b32 v15, s31
+; GFX1132-NEXT: v_dual_mov_b32 v16, s36 :: v_dual_mov_b32 v17, s37
+; GFX1132-NEXT: v_dual_mov_b32 v18, s38 :: v_dual_mov_b32 v19, s39
+; GFX1132-NEXT: v_dual_mov_b32 v20, s40 :: v_dual_mov_b32 v21, s41
+; GFX1132-NEXT: v_dual_mov_b32 v22, s42 :: v_dual_mov_b32 v23, s43
+; GFX1132-NEXT: v_dual_mov_b32 v24, s44 :: v_dual_mov_b32 v25, s45
+; GFX1132-NEXT: v_dual_mov_b32 v26, s46 :: v_dual_mov_b32 v27, s47
+; GFX1132-NEXT: v_dual_mov_b32 v28, s48 :: v_dual_mov_b32 v29, s49
+; GFX1132-NEXT: v_mov_b32_e32 v30, s50
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1132-NEXT: s_mov_b32 s13, s14
+; GFX1132-NEXT: s_mov_b32 s14, s15
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX1132-NEXT: s_endpgm
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %dyn_alloca_caller = alloca i64, i32 %idx, addrspace(5)
+ store volatile i32 123, ptr addrspace(5) %dyn_alloca_caller
+ call void @callee(<33 x i32> %a)
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10: {{.*}}
More information about the llvm-commits
mailing list