[llvm] 3884cb9 - AMDGPU: Always reserve VGPR for AGPR copies on gfx908

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 16 15:48:21 PST 2022


Author: Matt Arsenault
Date: 2022-02-16T18:48:18-05:00
New Revision: 3884cb92359f68390816344f5e9bb40d78e492a5

URL: https://github.com/llvm/llvm-project/commit/3884cb92359f68390816344f5e9bb40d78e492a5
DIFF: https://github.com/llvm/llvm-project/commit/3884cb92359f68390816344f5e9bb40d78e492a5.diff

LOG: AMDGPU: Always reserve VGPR for AGPR copies on gfx908

Just because there aren't AGPRs in the original program doesn't mean
the register allocator can't choose to use them (unless we were to
forcibly reserve all AGPRs when there are no uses). This happens in
high-pressure situations and introduces copies to avoid spills.

In this test, the allocator ends up introducing a copy from SGPR to
AGPR, which requires an intermediate VGPR. I don't believe it would
introduce a copy from AGPR to AGPR in this situation, since that would
mean using an intermediate of a different register class.

Theoretically this is also broken on gfx90a, but I have been unable to
come up with a testcase.
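
For reference, the SGPR-to-AGPR copy that needs the scratch register
expands into a two-instruction sequence; this is a sketch taken from the
gfx908 output of the new MIR test below, and the exact register numbers
are simply whatever the allocator and the reservation pick:

    v_mov_b32_e32       v32, s4   ; SGPR source moved into the reserved scratch VGPR
    v_accvgpr_write_b32 a0, v32   ; the AGPR is then written from that VGPR

Without the always-reserved VGPR, there is nothing guaranteed to be free
for the first instruction once the allocator has decided to spill values
into AGPRs.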

Added: 
    llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir

Modified: 
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
    llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 3eaa28a0afeb0..9ce83a65dd0d8 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -622,7 +622,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
       } else
         MaxNumAGPRs = 0;
     }
-  } else if (ST.hasMAIInsts() && MFI->usesAGPRs(MF)) {
+  } else if (ST.hasMAIInsts()) {
     // In order to guarantee copying between AGPRs, we need a scratch VGPR
     // available at all times.
     reserveRegisterTuples(Reserved, AMDGPU::VGPR32);

diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 8ec375cf93b6d..6a1f13c5d3e1c 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -254,59 +254,59 @@ define amdgpu_kernel void @no_agpr_no_reserve(<32 x i32> addrspace(1)* %arg) #0
 ; GFX908-LABEL: no_agpr_no_reserve:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 7, v0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
-; GFX908-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1]
-; GFX908-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
-; GFX908-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:32
-; GFX908-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:80
-; GFX908-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:64
-; GFX908-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
-; GFX908-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; GFX908-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1] offset:16
+; GFX908-NEXT:    global_load_dwordx4 v[5:8], v4, s[0:1]
+; GFX908-NEXT:    global_load_dwordx4 v[9:12], v4, s[0:1] offset:48
+; GFX908-NEXT:    global_load_dwordx4 v[13:16], v4, s[0:1] offset:32
+; GFX908-NEXT:    global_load_dwordx4 v[17:20], v4, s[0:1] offset:80
+; GFX908-NEXT:    global_load_dwordx4 v[21:24], v4, s[0:1] offset:64
+; GFX908-NEXT:    global_load_dwordx4 v[25:28], v4, s[0:1] offset:112
+; GFX908-NEXT:    global_load_dwordx4 v[33:36], v4, s[0:1] offset:96
 ; GFX908-NEXT:    s_waitcnt vmcnt(7)
 ; GFX908-NEXT:    v_add_u32_e32 v3, v3, v3
 ; GFX908-NEXT:    v_add_u32_e32 v2, v2, v2
 ; GFX908-NEXT:    v_add_u32_e32 v1, v1, v1
 ; GFX908-NEXT:    v_add_u32_e32 v0, v0, v0
 ; GFX908-NEXT:    s_waitcnt vmcnt(6)
+; GFX908-NEXT:    v_add_u32_e32 v8, v8, v8
 ; GFX908-NEXT:    v_add_u32_e32 v7, v7, v7
 ; GFX908-NEXT:    v_add_u32_e32 v6, v6, v6
-; GFX908-NEXT:    v_add_u32_e32 v5, v5, v5
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_add_u32_e32 v31, v31, v31
-; GFX908-NEXT:    v_add_u32_e32 v30, v30, v30
-; GFX908-NEXT:    v_add_u32_e32 v29, v29, v29
-; GFX908-NEXT:    v_add_u32_e32 v28, v28, v28
-; GFX908-NEXT:    v_add_u32_e32 v4, v4, v4
+; GFX908-NEXT:    v_add_u32_e32 v36, v36, v36
+; GFX908-NEXT:    v_add_u32_e32 v35, v35, v35
+; GFX908-NEXT:    v_add_u32_e32 v34, v34, v34
+; GFX908-NEXT:    v_add_u32_e32 v33, v33, v33
+; GFX908-NEXT:    v_add_u32_e32 v5, v5, v5
+; GFX908-NEXT:    v_add_u32_e32 v12, v12, v12
 ; GFX908-NEXT:    v_add_u32_e32 v11, v11, v11
 ; GFX908-NEXT:    v_add_u32_e32 v10, v10, v10
 ; GFX908-NEXT:    v_add_u32_e32 v9, v9, v9
-; GFX908-NEXT:    v_add_u32_e32 v8, v8, v8
+; GFX908-NEXT:    v_add_u32_e32 v16, v16, v16
 ; GFX908-NEXT:    v_add_u32_e32 v15, v15, v15
 ; GFX908-NEXT:    v_add_u32_e32 v14, v14, v14
 ; GFX908-NEXT:    v_add_u32_e32 v13, v13, v13
-; GFX908-NEXT:    v_add_u32_e32 v12, v12, v12
+; GFX908-NEXT:    v_add_u32_e32 v20, v20, v20
 ; GFX908-NEXT:    v_add_u32_e32 v19, v19, v19
 ; GFX908-NEXT:    v_add_u32_e32 v18, v18, v18
 ; GFX908-NEXT:    v_add_u32_e32 v17, v17, v17
-; GFX908-NEXT:    v_add_u32_e32 v16, v16, v16
+; GFX908-NEXT:    v_add_u32_e32 v24, v24, v24
 ; GFX908-NEXT:    v_add_u32_e32 v23, v23, v23
 ; GFX908-NEXT:    v_add_u32_e32 v22, v22, v22
 ; GFX908-NEXT:    v_add_u32_e32 v21, v21, v21
-; GFX908-NEXT:    v_add_u32_e32 v20, v20, v20
+; GFX908-NEXT:    v_add_u32_e32 v28, v28, v28
 ; GFX908-NEXT:    v_add_u32_e32 v27, v27, v27
 ; GFX908-NEXT:    v_add_u32_e32 v26, v26, v26
 ; GFX908-NEXT:    v_add_u32_e32 v25, v25, v25
-; GFX908-NEXT:    v_add_u32_e32 v24, v24, v24
-; GFX908-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
-; GFX908-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
-; GFX908-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:64
-; GFX908-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:80
-; GFX908-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:32
-; GFX908-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
-; GFX908-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1]
-; GFX908-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; GFX908-NEXT:    global_store_dwordx4 v4, v[33:36], s[0:1] offset:96
+; GFX908-NEXT:    global_store_dwordx4 v4, v[25:28], s[0:1] offset:112
+; GFX908-NEXT:    global_store_dwordx4 v4, v[21:24], s[0:1] offset:64
+; GFX908-NEXT:    global_store_dwordx4 v4, v[17:20], s[0:1] offset:80
+; GFX908-NEXT:    global_store_dwordx4 v4, v[13:16], s[0:1] offset:32
+; GFX908-NEXT:    global_store_dwordx4 v4, v[9:12], s[0:1] offset:48
+; GFX908-NEXT:    global_store_dwordx4 v4, v[5:8], s[0:1]
+; GFX908-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
 ; GFX908-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: no_agpr_no_reserve:
@@ -518,9 +518,358 @@ define void @v32_asm_def_use(float %v0, float %v1) #0 {
   ret void
 }
 
+define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg2, i64 %arg3, <2 x half> %arg4, <2 x half> %arg5) #3 {
+; GFX908-LABEL: introduced_copy_to_sgpr:
+; GFX908:       ; %bb.0: ; %bb
+; GFX908-NEXT:    global_load_ushort v0, v[0:1], off glc
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX908-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
+; GFX908-NEXT:    s_load_dword s7, s[4:5], 0x18
+; GFX908-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX908-NEXT:    s_mov_b32 s6, 0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_cvt_f32_u32_e32 v1, s1
+; GFX908-NEXT:    s_sub_i32 s4, 0, s1
+; GFX908-NEXT:    s_lshl_b64 s[10:11], s[2:3], 5
+; GFX908-NEXT:    s_or_b32 s10, s10, 28
+; GFX908-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX908-NEXT:    v_mov_b32_e32 v13, s10
+; GFX908-NEXT:    s_lshr_b32 s12, s7, 16
+; GFX908-NEXT:    v_mov_b32_e32 v32, s11
+; GFX908-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX908-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v28, s7
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v29, s12
+; GFX908-NEXT:    v_accvgpr_write_b32 a0, v13
+; GFX908-NEXT:    v_mul_lo_u32 v1, s4, v2
+; GFX908-NEXT:    v_accvgpr_write_b32 a1, v32
+; GFX908-NEXT:    v_mov_b32_e32 v11, s3
+; GFX908-NEXT:    s_lshl_b64 s[4:5], s[8:9], 5
+; GFX908-NEXT:    v_mul_hi_u32 v3, v2, v1
+; GFX908-NEXT:    v_mov_b32_e32 v1, 0
+; GFX908-NEXT:    v_mov_b32_e32 v10, s2
+; GFX908-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX908-NEXT:    v_mul_hi_u32 v4, s0, v2
+; GFX908-NEXT:    v_mul_lo_u32 v5, v4, s1
+; GFX908-NEXT:    v_add_u32_e32 v6, 1, v4
+; GFX908-NEXT:    v_sub_u32_e32 v5, s0, v5
+; GFX908-NEXT:    v_cmp_le_u32_e32 vcc, s1, v5
+; GFX908-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX908-NEXT:    v_subrev_u32_e32 v6, s1, v5
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
+; GFX908-NEXT:    v_add_u32_e32 v7, 1, v4
+; GFX908-NEXT:    v_cmp_le_u32_e32 vcc, s1, v5
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_and_b32_e32 v30, 0xffff, v0
+; GFX908-NEXT:    v_cndmask_b32_e32 v0, v4, v7, vcc
+; GFX908-NEXT:    v_mul_lo_u32 v8, s9, v30
+; GFX908-NEXT:    v_mul_hi_u32 v9, s8, v30
+; GFX908-NEXT:    v_lshlrev_b64 v[2:3], 5, v[0:1]
+; GFX908-NEXT:    v_mul_lo_u32 v6, s8, v30
+; GFX908-NEXT:    v_add_u32_e32 v7, v9, v8
+; GFX908-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT:    v_lshlrev_b64 v[6:7], 5, v[6:7]
+; GFX908-NEXT:    s_branch .LBB3_2
+; GFX908-NEXT:  .LBB3_1: ; %bb12
+; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
+; GFX908-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v0
+; GFX908-NEXT:    s_nop 0
+; GFX908-NEXT:    v_accvgpr_read_b32 v3, a1
+; GFX908-NEXT:    v_accvgpr_read_b32 v5, a3
+; GFX908-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX908-NEXT:    v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT:    v_accvgpr_read_b32 v4, a2
+; GFX908-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX908-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX908-NEXT:    s_nop 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a0, v2
+; GFX908-NEXT:    v_accvgpr_write_b32 a1, v3
+; GFX908-NEXT:  .LBB3_2: ; %bb9
+; GFX908-NEXT:    ; =>This Loop Header: Depth=1
+; GFX908-NEXT:    ; Child Loop BB3_5 Depth 2
+; GFX908-NEXT:    s_cbranch_scc0 .LBB3_1
+; GFX908-NEXT:  ; %bb.3: ; %bb14
+; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
+; GFX908-NEXT:    v_mov_b32_e32 v2, 0
+; GFX908-NEXT:    v_mov_b32_e32 v3, 0
+; GFX908-NEXT:    global_load_dwordx2 v[12:13], v[2:3], off
+; GFX908-NEXT:    s_mov_b32 s7, s6
+; GFX908-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[10:11]
+; GFX908-NEXT:    v_accvgpr_read_b32 v15, a1
+; GFX908-NEXT:    v_mov_b32_e32 v17, s7
+; GFX908-NEXT:    v_mov_b32_e32 v19, s7
+; GFX908-NEXT:    v_accvgpr_read_b32 v14, a0
+; GFX908-NEXT:    v_mov_b32_e32 v16, s6
+; GFX908-NEXT:    v_mov_b32_e32 v18, s6
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_add_co_u32_e32 v22, vcc, 1, v12
+; GFX908-NEXT:    v_addc_co_u32_e32 v20, vcc, 0, v13, vcc
+; GFX908-NEXT:    v_mul_lo_u32 v23, s4, v20
+; GFX908-NEXT:    v_mul_hi_u32 v24, s4, v22
+; GFX908-NEXT:    v_mul_lo_u32 v25, s5, v22
+; GFX908-NEXT:    v_mul_lo_u32 v31, s4, v22
+; GFX908-NEXT:    v_mov_b32_e32 v21, s7
+; GFX908-NEXT:    v_add_u32_e32 v22, v24, v23
+; GFX908-NEXT:    v_add_u32_e32 v33, v22, v25
+; GFX908-NEXT:    v_mov_b32_e32 v23, s7
+; GFX908-NEXT:    v_mov_b32_e32 v20, s6
+; GFX908-NEXT:    v_mov_b32_e32 v22, s6
+; GFX908-NEXT:    s_branch .LBB3_5
+; GFX908-NEXT:  .LBB3_4: ; %bb58
+; GFX908-NEXT:    ; in Loop: Header=BB3_5 Depth=2
+; GFX908-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v30
+; GFX908-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX908-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[12:13]
+; GFX908-NEXT:    v_add_co_u32_e64 v14, s[2:3], v14, v6
+; GFX908-NEXT:    v_addc_co_u32_e64 v15, s[2:3], v15, v7, s[2:3]
+; GFX908-NEXT:    s_and_b64 vcc, exec, vcc
+; GFX908-NEXT:    s_cbranch_vccz .LBB3_1
+; GFX908-NEXT:  .LBB3_5: ; %bb16
+; GFX908-NEXT:    ; Parent Loop BB3_2 Depth=1
+; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX908-NEXT:    v_add_co_u32_e32 v24, vcc, v14, v31
+; GFX908-NEXT:    v_addc_co_u32_e32 v25, vcc, v15, v33, vcc
+; GFX908-NEXT:    global_load_dword v35, v[24:25], off offset:-12 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    global_load_dword v34, v[24:25], off offset:-8 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    global_load_dword v26, v[24:25], off offset:-4 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    global_load_dword v24, v[24:25], off glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    ds_read_b64 v[24:25], v1
+; GFX908-NEXT:    ds_read_b64 v[26:27], v0
+; GFX908-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    s_cbranch_vccnz .LBB3_4
+; GFX908-NEXT:  ; %bb.6: ; %bb51
+; GFX908-NEXT:    ; in Loop: Header=BB3_5 Depth=2
+; GFX908-NEXT:    v_cvt_f32_f16_sdwa v8, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v9, v35
+; GFX908-NEXT:    v_cvt_f32_f16_sdwa v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v34, v34
+; GFX908-NEXT:    v_add_f32_e32 v4, v28, v24
+; GFX908-NEXT:    v_add_f32_e32 v5, v29, v25
+; GFX908-NEXT:    v_add_f32_e32 v2, 0, v24
+; GFX908-NEXT:    v_add_f32_e32 v3, 0, v25
+; GFX908-NEXT:    v_add_f32_e32 v8, v8, v27
+; GFX908-NEXT:    v_add_f32_e32 v9, v9, v26
+; GFX908-NEXT:    v_add_f32_e32 v25, v35, v25
+; GFX908-NEXT:    v_add_f32_e32 v24, v34, v24
+; GFX908-NEXT:    v_add_f32_e32 v17, v17, v5
+; GFX908-NEXT:    v_add_f32_e32 v16, v16, v4
+; GFX908-NEXT:    v_add_f32_e32 v19, v19, v3
+; GFX908-NEXT:    v_add_f32_e32 v18, v18, v2
+; GFX908-NEXT:    v_add_f32_e32 v20, v20, v9
+; GFX908-NEXT:    v_add_f32_e32 v21, v21, v8
+; GFX908-NEXT:    v_add_f32_e32 v22, v22, v24
+; GFX908-NEXT:    v_add_f32_e32 v23, v23, v25
+; GFX908-NEXT:    s_branch .LBB3_4
+;
+; GFX90A-LABEL: introduced_copy_to_sgpr:
+; GFX90A:       ; %bb.0: ; %bb
+; GFX90A-NEXT:    global_load_ushort v10, v[0:1], off glc
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
+; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x10
+; GFX90A-NEXT:    s_load_dword s2, s[4:5], 0x18
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NEXT:    s_mov_b32 s4, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX90A-NEXT:    s_sub_i32 s5, 0, s7
+; GFX90A-NEXT:    s_lshr_b32 s12, s2, 16
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v2, s2
+; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v3, s12
+; GFX90A-NEXT:    s_lshl_b64 s[10:11], s[8:9], 5
+; GFX90A-NEXT:    s_or_b32 s10, s10, 28
+; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX90A-NEXT:    s_lshl_b64 s[2:3], s[0:1], 5
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-NEXT:    v_mul_lo_u32 v8, s5, v0
+; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v8
+; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v8, v0, s7
+; GFX90A-NEXT:    v_sub_u32_e32 v8, s6, v8
+; GFX90A-NEXT:    v_add_u32_e32 v9, 1, v0
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s7, v8
+; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
+; GFX90A-NEXT:    v_subrev_u32_e32 v9, s7, v8
+; GFX90A-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
+; GFX90A-NEXT:    v_add_u32_e32 v9, 1, v0
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s7, v8
+; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
+; GFX90A-NEXT:    v_lshlrev_b64 v[8:9], 5, v[0:1]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_and_b32_e32 v30, 0xffff, v10
+; GFX90A-NEXT:    v_mul_lo_u32 v11, s1, v30
+; GFX90A-NEXT:    v_mul_hi_u32 v12, s0, v30
+; GFX90A-NEXT:    v_mul_lo_u32 v10, s0, v30
+; GFX90A-NEXT:    v_add_u32_e32 v11, v12, v11
+; GFX90A-NEXT:    v_lshlrev_b64 v[10:11], 5, v[10:11]
+; GFX90A-NEXT:    v_pk_mov_b32 v[12:13], 0, 0
+; GFX90A-NEXT:    s_branch .LBB3_2
+; GFX90A-NEXT:  .LBB3_1: ; %bb12
+; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v8
+; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v9, vcc
+; GFX90A-NEXT:  .LBB3_2: ; %bb9
+; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
+; GFX90A-NEXT:    ; Child Loop BB3_5 Depth 2
+; GFX90A-NEXT:    s_cbranch_scc0 .LBB3_1
+; GFX90A-NEXT:  ; %bb.3: ; %bb14
+; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
+; GFX90A-NEXT:    global_load_dwordx2 v[14:15], v[12:13], off
+; GFX90A-NEXT:    s_mov_b32 s5, s4
+; GFX90A-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[4:5]
+; GFX90A-NEXT:    v_pk_mov_b32 v[16:17], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[18:19], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[20:21], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[22:23], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_add_co_u32_e32 v24, vcc, 1, v14
+; GFX90A-NEXT:    v_addc_co_u32_e32 v25, vcc, 0, v15, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v25, s2, v25
+; GFX90A-NEXT:    v_mul_hi_u32 v26, s2, v24
+; GFX90A-NEXT:    v_mul_lo_u32 v27, s3, v24
+; GFX90A-NEXT:    v_mul_lo_u32 v31, s2, v24
+; GFX90A-NEXT:    v_add_u32_e32 v24, v26, v25
+; GFX90A-NEXT:    v_add_u32_e32 v32, v24, v27
+; GFX90A-NEXT:    v_pk_mov_b32 v[24:25], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    s_branch .LBB3_5
+; GFX90A-NEXT:  .LBB3_4: ; %bb58
+; GFX90A-NEXT:    ; in Loop: Header=BB3_5 Depth=2
+; GFX90A-NEXT:    v_add_co_u32_e32 v14, vcc, v14, v30
+; GFX90A-NEXT:    v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v16, vcc, v16, v10
+; GFX90A-NEXT:    v_addc_co_u32_e32 v17, vcc, v17, v11, vcc
+; GFX90A-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
+; GFX90A-NEXT:    s_and_b64 vcc, exec, vcc
+; GFX90A-NEXT:    s_cbranch_vccz .LBB3_1
+; GFX90A-NEXT:  .LBB3_5: ; %bb16
+; GFX90A-NEXT:    ; Parent Loop BB3_2 Depth=1
+; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX90A-NEXT:    v_add_co_u32_e32 v26, vcc, v16, v31
+; GFX90A-NEXT:    v_addc_co_u32_e32 v27, vcc, v17, v32, vcc
+; GFX90A-NEXT:    global_load_dword v34, v[26:27], off offset:-12 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_load_dword v33, v[26:27], off offset:-8 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_load_dword v28, v[26:27], off offset:-4 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_load_dword v28, v[26:27], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    ; kill: killed $vgpr26 killed $vgpr27
+; GFX90A-NEXT:    ds_read_b64 v[26:27], v1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    ds_read_b64 v[28:29], v0
+; GFX90A-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_cbranch_vccnz .LBB3_4
+; GFX90A-NEXT:  ; %bb.6: ; %bb51
+; GFX90A-NEXT:    ; in Loop: Header=BB3_5 Depth=2
+; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v34, v34
+; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v37, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v36, v33
+; GFX90A-NEXT:    v_pk_add_f32 v[38:39], v[2:3], v[26:27]
+; GFX90A-NEXT:    v_pk_add_f32 v[40:41], v[26:27], 0 op_sel_hi:[1,0]
+; GFX90A-NEXT:    v_pk_add_f32 v[28:29], v[34:35], v[28:29]
+; GFX90A-NEXT:    v_pk_add_f32 v[26:27], v[36:37], v[26:27]
+; GFX90A-NEXT:    v_pk_add_f32 v[18:19], v[18:19], v[38:39]
+; GFX90A-NEXT:    v_pk_add_f32 v[20:21], v[20:21], v[40:41]
+; GFX90A-NEXT:    v_pk_add_f32 v[22:23], v[22:23], v[28:29]
+; GFX90A-NEXT:    v_pk_add_f32 v[24:25], v[24:25], v[26:27]
+; GFX90A-NEXT:    s_branch .LBB3_4
+bb:
+  %i = load volatile i16, i16 addrspace(4)* undef, align 2
+  %i6 = zext i16 %i to i64
+  %i7 = udiv i32 %arg1, %arg2
+  %i8 = zext i32 %i7 to i64
+  br label %bb9
+
+bb9:                                              ; preds = %bb12, %bb
+  %i10 = phi i64 [ %arg3, %bb ], [ %i13, %bb12 ]
+  %i11 = icmp slt i64 %i10, 0
+  br i1 undef, label %bb14, label %bb12
+
+bb12:                                             ; preds = %bb58, %bb9
+  %i13 = add nuw nsw i64 %i10, %i8
+  br label %bb9
+
+bb14:                                             ; preds = %bb9
+  %i15 = load i64, i64 addrspace(1)* null, align 8
+  br label %bb16
+
+bb16:                                             ; preds = %bb58, %bb14
+  %i17 = phi i64 [ %i65, %bb58 ], [ %i15, %bb14 ]
+  %i18 = phi <2 x float> [ %i59, %bb58 ], [ zeroinitializer, %bb14 ]
+  %i19 = phi <2 x float> [ %i60, %bb58 ], [ zeroinitializer, %bb14 ]
+  %i20 = phi <2 x float> [ %i61, %bb58 ], [ zeroinitializer, %bb14 ]
+  %i21 = phi <2 x float> [ %i62, %bb58 ], [ zeroinitializer, %bb14 ]
+  %i22 = add nsw i64 %i17, 1
+  %i23 = mul nsw i64 %i22, %arg
+  %i24 = add nsw i64 %i23, %i10
+  %i25 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 8
+  %i26 = bitcast half addrspace(1)* %i25 to <2 x half> addrspace(1)*
+  %i27 = load volatile <2 x half>, <2 x half> addrspace(1)* %i26, align 16
+  %i28 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 10
+  %i29 = bitcast half addrspace(1)* %i28 to <2 x half> addrspace(1)*
+  %i30 = load volatile <2 x half>, <2 x half> addrspace(1)* %i29, align 4
+  %i31 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 12
+  %i32 = bitcast half addrspace(1)* %i31 to <2 x half> addrspace(1)*
+  %i33 = load volatile <2 x half>, <2 x half> addrspace(1)* %i32, align 8
+  %i34 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 14
+  %i35 = bitcast half addrspace(1)* %i34 to <2 x half> addrspace(1)*
+  %i36 = load volatile <2 x half>, <2 x half> addrspace(1)* %i35, align 4
+  %i37 = fpext <2 x half> %arg4 to <2 x float>
+  %i39 = fpext <2 x half> %i27 to <2 x float>
+  %i40 = fpext <2 x half> %i30 to <2 x float>
+  %i41 = fpext <2 x half> %i33 to <2 x float>
+  %i42 = fpext <2 x half> %i36 to <2 x float>
+  %i43 = load volatile <2 x float>, <2 x float> addrspace(3)* null, align 8
+  %i44 = fadd contract <2 x float> %i37, %i43
+  %i45 = fadd contract <2 x float> %i43, zeroinitializer
+  %i46 = load volatile <2 x float>, <2 x float> addrspace(3)* undef, align 32
+  %i47 = fadd contract <2 x float> %i39, %i46
+  %i48 = fadd contract <2 x float> %i40, %i43
+  %i49 = fadd contract <2 x float> %i41, zeroinitializer
+  %i50 = fadd contract <2 x float> %i42, zeroinitializer
+  fence syncscope("workgroup") acquire
+  br i1 %i11, label %bb58, label %bb51
+
+bb51:                                             ; preds = %bb16
+  %i52 = fadd contract <2 x float> %i18, %i44
+  %i53 = fadd contract <2 x float> %i19, %i45
+  %i54 = fadd contract <2 x float> %i20, %i47
+  %i55 = fadd contract <2 x float> %i21, %i48
+  %i56 = fadd contract <2 x float> %i49, zeroinitializer
+  %i57 = fadd contract <2 x float> %i50, zeroinitializer
+  br label %bb58
+
+bb58:                                             ; preds = %bb51, %bb16
+  %i59 = phi <2 x float> [ %i18, %bb16 ], [ %i52, %bb51 ]
+  %i60 = phi <2 x float> [ %i19, %bb16 ], [ %i53, %bb51 ]
+  %i61 = phi <2 x float> [ %i20, %bb16 ], [ %i54, %bb51 ]
+  %i62 = phi <2 x float> [ %i21, %bb16 ], [ %i55, %bb51 ]
+  %i63 = phi <2 x float> [ zeroinitializer, %bb16 ], [ %i56, %bb51 ]
+  %i64 = phi <2 x float> [ zeroinitializer, %bb16 ], [ %i57, %bb51 ]
+  %i65 = add nsw i64 %i17, %i6
+  %i66 = icmp slt i64 %i65, 0
+  br i1 %i66, label %bb16, label %bb12
+}
+
 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #1
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 
 attributes #0 = { "amdgpu-waves-per-eu"="6,6" }
 attributes #1 = { convergent nounwind readnone willreturn }
 attributes #2 = { nounwind readnone willreturn }
+attributes #3 = { "amdgpu-waves-per-eu"="7,7" }

diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir b/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir
new file mode 100644
index 0000000000000..00936443ba9be
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir
@@ -0,0 +1,471 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -start-before=greedy,0 -stop-after=postrapseudos -o - %s | FileCheck -check-prefix=GFX908 %s
+
+# This testcase has a long sequence of sgpr to vgpr copies with a lot of vgpr
+# pressure, encouraging the allocator to displace some into AGPRs. This
+# introduces SGPR to AGPR copies which require a reserved temporary VGPR to
+# handle.
+
+--- |
+
+  define amdgpu_kernel void @regalloc_introduces_s_to_a_copy() #0 {
+    ret void
+  }
+
+  attributes #0 = { "amdgpu-waves-per-eu"="7,7" }
+
+...
+---
+name:            regalloc_introduces_s_to_a_copy
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+
+    ; GFX908-LABEL: name: regalloc_introduces_s_to_a_copy
+    ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, $vgpr32_vgpr33_vgpr34_vgpr35, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
+    ; GFX908-NEXT: {{  $}}
+    ; GFX908-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr7, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX908-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr32 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr33 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr34 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX908-NEXT: $agpr35 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: renamable $sgpr4 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr5 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr6 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr7 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr8 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr9 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr10 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr11 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr12 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr13 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr14 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr15 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr16 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr18 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr19 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr20 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr21 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr22 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr23 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr24 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr25 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr26 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr27 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr28 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr29 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr30 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr31 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr34 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr35 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr36 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr37 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr38 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr39 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: renamable $sgpr40 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    ; GFX908-NEXT: $vgpr32 = V_MOV_B32_e32 killed $sgpr4, implicit $exec
+    ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr32, implicit $exec, implicit $exec
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr8, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr10, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr12, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr13, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr16, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr17, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr20, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr21, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr22, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr23, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, 0, implicit $exec :: (store (s32) into %stack.19, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr24, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, 0, implicit $exec :: (store (s32) into %stack.20, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr25, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, 0, implicit $exec :: (store (s32) into %stack.21, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr26, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, 0, implicit $exec :: (store (s32) into %stack.22, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr27, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, 0, implicit $exec :: (store (s32) into %stack.23, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr28, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, 0, implicit $exec :: (store (s32) into %stack.24, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr29, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, 0, implicit $exec :: (store (s32) into %stack.25, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr30, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, 0, implicit $exec :: (store (s32) into %stack.26, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr31, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, 0, implicit $exec :: (store (s32) into %stack.27, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, 0, implicit $exec :: (store (s32) into %stack.28, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr35, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, 0, implicit $exec :: (store (s32) into %stack.29, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr36, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, 0, implicit $exec :: (store (s32) into %stack.30, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr37, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, 0, implicit $exec :: (store (s32) into %stack.31, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr38, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, 0, implicit $exec :: (store (s32) into %stack.32, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr39, implicit $exec, implicit $exec
+    ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, 0, implicit $exec :: (store (s32) into %stack.33, addrspace 5)
+    ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr40, implicit $exec, implicit $exec
+    ; GFX908-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19, implicit $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, implicit $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, implicit $vgpr35
+    ; GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr16, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr17, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr18, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr19, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr20, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr21, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr22, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr23, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr24, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr25, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr26, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr27, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr28, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr29, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr30, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr31, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr34, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec, implicit $exec
+    ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec
+    ; GFX908-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+    ; GFX908-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+    ; GFX908-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+    ; GFX908-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+    ; GFX908-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+    ; GFX908-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+    ; GFX908-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+    ; GFX908-NEXT: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+    ; GFX908-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+    ; GFX908-NEXT: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+    ; GFX908-NEXT: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+    ; GFX908-NEXT: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+    ; GFX908-NEXT: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+    ; GFX908-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+    ; GFX908-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+    ; GFX908-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+    ; GFX908-NEXT: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+    ; GFX908-NEXT: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5)
+    ; GFX908-NEXT: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, 0, implicit $exec :: (load (s32) from %stack.19, addrspace 5)
+    ; GFX908-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, 0, implicit $exec :: (load (s32) from %stack.20, addrspace 5)
+    ; GFX908-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, 0, implicit $exec :: (load (s32) from %stack.21, addrspace 5)
+    ; GFX908-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, 0, implicit $exec :: (load (s32) from %stack.22, addrspace 5)
+    ; GFX908-NEXT: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, 0, implicit $exec :: (load (s32) from %stack.23, addrspace 5)
+    ; GFX908-NEXT: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, 0, implicit $exec :: (load (s32) from %stack.24, addrspace 5)
+    ; GFX908-NEXT: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, 0, implicit $exec :: (load (s32) from %stack.25, addrspace 5)
+    ; GFX908-NEXT: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, 0, implicit $exec :: (load (s32) from %stack.26, addrspace 5)
+    ; GFX908-NEXT: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, 0, implicit $exec :: (load (s32) from %stack.27, addrspace 5)
+    ; GFX908-NEXT: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, 0, implicit $exec :: (load (s32) from %stack.28, addrspace 5)
+    ; GFX908-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, 0, implicit $exec :: (load (s32) from %stack.29, addrspace 5)
+    ; GFX908-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, 0, implicit $exec :: (load (s32) from %stack.30, addrspace 5)
+    ; GFX908-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, 0, implicit $exec :: (load (s32) from %stack.31, addrspace 5)
+    ; GFX908-NEXT: $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, 0, implicit $exec :: (load (s32) from %stack.32, addrspace 5)
+    ; GFX908-NEXT: $vgpr35 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, 0, implicit $exec :: (load (s32) from %stack.33, addrspace 5)
+    ; GFX908-NEXT: S_NOP 0, implicit renamable $agpr0, implicit killed renamable $vgpr1, implicit killed renamable $vgpr2, implicit killed renamable $vgpr3, implicit killed renamable $vgpr4, implicit killed renamable $vgpr5, implicit killed renamable $vgpr6, implicit killed renamable $vgpr7, implicit killed renamable $vgpr8, implicit killed renamable $vgpr9, implicit killed renamable $vgpr10, implicit killed renamable $vgpr11, implicit killed renamable $vgpr12, implicit killed renamable $vgpr13, implicit killed renamable $vgpr14, implicit killed renamable $vgpr15, implicit killed renamable $vgpr16, implicit killed renamable $vgpr17, implicit killed renamable $vgpr18, implicit killed renamable $vgpr19, implicit killed renamable $vgpr20, implicit killed renamable $vgpr21, implicit killed renamable $vgpr22, implicit killed renamable $vgpr23, implicit killed renamable $vgpr24, implicit killed renamable $vgpr25, implicit killed renamable $vgpr26, implicit killed renamable $vgpr27, implicit killed renamable $vgpr28, implicit killed renamable $vgpr29, implicit killed renamable $vgpr30, implicit killed renamable $vgpr31, implicit killed renamable $vgpr33, implicit killed renamable $vgpr35, implicit killed renamable $vgpr34
+    ; GFX908-NEXT: S_ENDPGM 0, implicit killed renamable $agpr0
+    %v0:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v1:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v2:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v3:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v4:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v5:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v6:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v7:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v8:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v9:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v10:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v11:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v12:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v13:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v14:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v15:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v16:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v17:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v18:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v19:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v20:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v21:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v22:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v23:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v24:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v25:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v26:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v27:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v28:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v29:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v30:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v31:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v32:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v33:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v34:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %v35:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec
+    %s0:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s1:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s2:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s3:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s4:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s5:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s6:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s7:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s8:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s9:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s10:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s11:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s12:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s13:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s14:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s15:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s16:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s17:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s18:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s19:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s20:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s21:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s22:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s23:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s24:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s25:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s26:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s27:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s28:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s29:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s30:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s31:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s32:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s33:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %s34:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0
+    %71:vgpr_32 = COPY %s0
+    %72:vgpr_32 = COPY %s1
+    %73:vgpr_32 = COPY %s2
+    %74:vgpr_32 = COPY %s3
+    %75:vgpr_32 = COPY %s4
+    %76:vgpr_32 = COPY %s5
+    %77:vgpr_32 = COPY %s6
+    %78:vgpr_32 = COPY %s7
+    %79:vgpr_32 = COPY %s8
+    %80:vgpr_32 = COPY %s9
+    %81:vgpr_32 = COPY %s10
+    %82:vgpr_32 = COPY %s11
+    %83:vgpr_32 = COPY %s12
+    %84:vgpr_32 = COPY %s13
+    %85:vgpr_32 = COPY %s14
+    %86:vgpr_32 = COPY %s15
+    %87:vgpr_32 = COPY %s16
+    %88:vgpr_32 = COPY %s17
+    %89:vgpr_32 = COPY %s18
+    %90:vgpr_32 = COPY %s19
+    %91:vgpr_32 = COPY %s20
+    %92:vgpr_32 = COPY %s21
+    %93:vgpr_32 = COPY %s22
+    %94:vgpr_32 = COPY %s23
+    %95:vgpr_32 = COPY %s24
+    %96:vgpr_32 = COPY %s25
+    %97:vgpr_32 = COPY %s26
+    %98:vgpr_32 = COPY %s27
+    %99:vgpr_32 = COPY %s28
+    %100:vgpr_32 = COPY %s29
+    %101:vgpr_32 = COPY %s30
+    %102:vgpr_32 = COPY %s31
+    %103:vgpr_32 = COPY %s32
+    %104:vgpr_32 = COPY %s33
+    %105:vgpr_32 = COPY %s34
+    S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19, implicit $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, implicit $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, implicit $vgpr35
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v1, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v2, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v3, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v4, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v5, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v6, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v7, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v8, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v9, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v10, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v11, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v12, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v13, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v14, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v15, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v16, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v17, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v18, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v19, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v20, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v21, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v22, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v23, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v24, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v25, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v26, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v27, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v28, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v29, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v30, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v31, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v32, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v33, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v34, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v35, 0, 0, implicit $exec
+    S_NOP 0, implicit %71, implicit %72, implicit %73, implicit %74, implicit %75, implicit %76, implicit %77, implicit %78, implicit %79, implicit %80, implicit %81, implicit %82, implicit %83, implicit %84, implicit %85, implicit %86, implicit %87, implicit %88, implicit %89, implicit %90, implicit %91, implicit %92, implicit %93, implicit %94, implicit %95, implicit %96, implicit %97, implicit %98, implicit %99, implicit %100, implicit %101, implicit %102, implicit %103, implicit %104, implicit %105
+    S_ENDPGM 0, implicit %71
+
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
index da95349c297b2..d0474fe47ebb1 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
@@ -194,11 +194,13 @@ define amdgpu_kernel void @max_10_vgprs_spill_v32(<32 x float> addrspace(1)* %p)
 ; GFX908-NOT: buffer_
 ; GFX908-DAG: v_accvgpr_read_b32
 
-; GCN:    NumVgprs: 256
+; GFX900: NumVgprs: 256
 ; GFX900: ScratchSize: 148
+; GFX908: NumVgprs: 255
 ; GFX908: ScratchSize: 0
 ; GCN:    VGPRBlocks: 63
-; GCN:    NumVGPRsForWavesPerEU: 256
+; GFX900:    NumVGPRsForWavesPerEU: 256
+; GFX908:    NumVGPRsForWavesPerEU: 255
 define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
@@ -242,11 +244,13 @@ define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %
 ; GFX908-NOT: buffer_
 ; GFX908-DAG: v_accvgpr_read_b32
 
-; GCN:    NumVgprs: 256
+; GFX900:    NumVgprs: 256
+; GFX908:    NumVgprs: 253
 ; GFX900: ScratchSize: 2052
 ; GFX908: ScratchSize: 0
 ; GCN:    VGPRBlocks: 63
-; GCN:    NumVGPRsForWavesPerEU: 256
+; GFX900:    NumVGPRsForWavesPerEU: 256
+; GFX908:    NumVGPRsForWavesPerEU: 253
 define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid

More information about the llvm-commits mailing list