[llvm] [AMDGPU] Enable atomic optimizer for divergent i64 and double values (PR #96934)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 12 00:08:39 PDT 2024
================
@@ -1725,106 +2344,1350 @@ entry:
}
define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
-; GFX7LESS-LABEL: add_i64_varying:
-; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s6, -1
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: s_mov_b32 s10, s6
-; GFX7LESS-NEXT: s_mov_b32 s11, s7
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s8, s2
-; GFX7LESS-NEXT: s_mov_b32 s9, s3
-; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
-; GFX7LESS-NEXT: s_mov_b32 s4, s0
-; GFX7LESS-NEXT: s_mov_b32 s5, s1
-; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX7LESS-NEXT: s_endpgm
+; GFX7LESS_ITERATIVE-LABEL: add_i64_varying:
+; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0
+; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s4, s4, s8
+; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7
+; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB5_1
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX7LESS_ITERATIVE-NEXT: ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1
+; GFX7LESS_ITERATIVE-NEXT: .LBB5_4:
+; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX7LESS_ITERATIVE-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX7LESS_ITERATIVE-NEXT: v_add_i32_e32 v0, vcc, s5, v1
+; GFX7LESS_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT: s_endpgm
+;
+; GFX8_ITERATIVE-LABEL: add_i64_varying:
+; GFX8_ITERATIVE: ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX8_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX8_ITERATIVE-NEXT: s_add_u32 s4, s4, s8
+; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0
+; GFX8_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX8_ITERATIVE-NEXT: ; %bb.3:
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX8_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol
+; GFX8_ITERATIVE-NEXT: .LBB5_4:
+; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX8_ITERATIVE-NEXT: v_add_u32_e32 v0, vcc, s5, v1
+; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT: s_endpgm
+;
+; GFX9_ITERATIVE-LABEL: add_i64_varying:
+; GFX9_ITERATIVE: ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX9_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0
+; GFX9_ITERATIVE-NEXT: s_add_u32 s4, s4, s8
+; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0
+; GFX9_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX9_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX9_ITERATIVE-NEXT: ; %bb.3:
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX9_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol
+; GFX9_ITERATIVE-NEXT: .LBB5_4:
+; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX9_ITERATIVE-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1
+; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc
+; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1064_ITERATIVE-LABEL: add_i64_varying:
+; GFX1064_ITERATIVE: ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1064_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s6
+; GFX1064_ITERATIVE-NEXT: s_add_u32 s4, s4, s7
+; GFX1064_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX1064_ITERATIVE-NEXT: ; %bb.3:
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1064_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT: .LBB5_4:
+; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1064_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1064_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1032_ITERATIVE-LABEL: add_i64_varying:
+; GFX1032_ITERATIVE: ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1032_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1032_ITERATIVE-NEXT: s_add_u32 s4, s4, s6
+; GFX1032_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX1032_ITERATIVE-NEXT: ; %bb.3:
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1032_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT: .LBB5_4:
+; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1032_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1032_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1164_ITERATIVE-LABEL: add_i64_varying:
+; GFX1164_ITERATIVE: ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1164_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s6
+; GFX1164_ITERATIVE-NEXT: s_add_u32 s4, s4, s7
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX1164_ITERATIVE-NEXT: ; %bb.3:
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1164_ITERATIVE-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], 0 glc
+; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT: .LBB5_4:
+; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT: s_nop 0
+; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1132_ITERATIVE-LABEL: add_i64_varying:
+; GFX1132_ITERATIVE: ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1132_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1132_ITERATIVE-NEXT: s_add_u32 s4, s4, s6
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s6, exec_lo
+; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX1132_ITERATIVE-NEXT: ; %bb.3:
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1132_ITERATIVE-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], 0 glc
+; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0)
+; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv
+; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT: .LBB5_4:
+; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1132_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT: s_nop 0
+; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1264_ITERATIVE-LABEL: add_i64_varying:
+; GFX1264_ITERATIVE: ; %bb.0: ; %entry
+; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s10
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s10
+; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], exec
+; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX1264_ITERATIVE-NEXT: ; %bb.3:
+; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4
+; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0
+; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
+; GFX1264_ITERATIVE-NEXT: .LBB5_4:
+; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1
+; GFX1264_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1264_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1264_ITERATIVE-NEXT: s_nop 0
+; GFX1264_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1264_ITERATIVE-NEXT: s_endpgm
+;
+; GFX1232_ITERATIVE-LABEL: add_i64_varying:
+; GFX1232_ITERATIVE: ; %bb.0: ; %entry
+; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
+; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0
+; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
+; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s3
+; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
+; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s6, exec_lo
+; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1232_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6
+; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
+; GFX1232_ITERATIVE-NEXT: ; %bb.3:
+; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1
+; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3
+; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0
+; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
+; GFX1232_ITERATIVE-NEXT: .LBB5_4:
+; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
+; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1232_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1
+; GFX1232_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1232_ITERATIVE-NEXT: s_nop 0
+; GFX1232_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1232_ITERATIVE-NEXT: s_endpgm
+;
+; GFX7LESS_DPP-LABEL: add_i64_varying:
+; GFX7LESS_DPP: ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000
+; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1
+; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6
+; GFX7LESS_DPP-NEXT: s_mov_b32 s11, s7
+; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT: s_mov_b32 s8, s2
+; GFX7LESS_DPP-NEXT: s_mov_b32 s9, s3
+; GFX7LESS_DPP-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
+; GFX7LESS_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS_DPP-NEXT: buffer_wbinvl1
+; GFX7LESS_DPP-NEXT: s_mov_b32 s4, s0
+; GFX7LESS_DPP-NEXT: s_mov_b32 s5, s1
+; GFX7LESS_DPP-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX7LESS_DPP-NEXT: s_endpgm
+;
+; GFX8_DPP-LABEL: add_i64_varying:
+; GFX8_DPP: ; %bb.0: ; %entry
+; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8_DPP-NEXT: s_cbranch_execz .LBB5_2
+; GFX8_DPP-NEXT: ; %bb.1:
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s7
+; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s10, -1
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: s_mov_b32 s8, s2
+; GFX8_DPP-NEXT: s_mov_b32 s9, s3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s6
+; GFX8_DPP-NEXT: buffer_atomic_add_x2 v[7:8], off, s[8:11], 0 glc
+; GFX8_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX8_DPP-NEXT: buffer_wbinvl1_vol
+; GFX8_DPP-NEXT: .LBB5_2:
+; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX8_DPP-NEXT: v_add_u32_e32 v7, vcc, s5, v7
+; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_addc_u32_e32 v8, vcc, v0, v8, vcc
+; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX8_DPP-NEXT: s_endpgm
+;
+; GFX9_DPP-LABEL: add_i64_varying:
+; GFX9_DPP: ; %bb.0: ; %entry
+; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9_DPP-NEXT: s_cbranch_execz .LBB5_2
+; GFX9_DPP-NEXT: ; %bb.1:
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s7
+; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s10, -1
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: s_mov_b32 s8, s2
+; GFX9_DPP-NEXT: s_mov_b32 s9, s3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s6
+; GFX9_DPP-NEXT: buffer_atomic_add_x2 v[7:8], off, s[8:11], 0 glc
+; GFX9_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9_DPP-NEXT: buffer_wbinvl1_vol
+; GFX9_DPP-NEXT: .LBB5_2:
+; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v7, vcc, s5, v7
+; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v8, vcc, v0, v8, vcc
+; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX9_DPP-NEXT: s_endpgm
;
-; GFX89-LABEL: add_i64_varying:
-; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
-; GFX89-NEXT: v_mov_b32_e32 v1, 0
-; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
-; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_wbinvl1_vol
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX89-NEXT: s_endpgm
-;
-; GFX10-LABEL: add_i64_varying:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s11, s7
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
-; GFX10-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_mov_b32 s5, s1
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: add_i64_varying:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: add_i64_varying:
-; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s11, s7
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_mov_b32 s5, s1
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
+; GFX1064_DPP-LABEL: add_i64_varying:
+; GFX1064_DPP: ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s10, v3, 47
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 16
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s11, v4, 47
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 63
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 32
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7]
+; GFX1064_DPP-NEXT: v_writelane_b32 v2, s11, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v1, s10, 48
+; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9]
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX1064_DPP-NEXT: s_cbranch_execz .LBB5_2
+; GFX1064_DPP-NEXT: ; %bb.1:
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1064_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1064_DPP-NEXT: buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc
+; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064_DPP-NEXT: buffer_gl1_inv
+; GFX1064_DPP-NEXT: buffer_gl0_inv
+; GFX1064_DPP-NEXT: .LBB5_2:
+; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v9
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s2, v11
+; GFX1064_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s3, v12, vcc
+; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1064_DPP-NEXT: s_endpgm
+;
+; GFX1032_DPP-LABEL: add_i64_varying:
+; GFX1032_DPP: ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s8, v4, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16
+; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; GFX1032_DPP-NEXT: s_cbranch_execz .LBB5_2
+; GFX1032_DPP-NEXT: ; %bb.1:
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1032_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1032_DPP-NEXT: buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc
+; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032_DPP-NEXT: buffer_gl1_inv
+; GFX1032_DPP-NEXT: buffer_gl0_inv
+; GFX1032_DPP-NEXT: .LBB5_2:
+; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v9
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10
+; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s2, v11
+; GFX1032_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo
+; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1032_DPP-NEXT: s_endpgm
+;
+; GFX1164_DPP-LABEL: add_i64_varying:
+; GFX1164_DPP: ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v9
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v7, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v4, 31
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 63
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s10, v3, 47
+; GFX1164_DPP-NEXT: v_readlane_b32 s11, v4, 47
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 63
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 32
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7]
+; GFX1164_DPP-NEXT: v_writelane_b32 v1, s10, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v2, s11, 48
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9]
+; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1164_DPP-NEXT: s_mov_b64 s[8:9], exec
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: s_cbranch_execz .LBB5_2
+; GFX1164_DPP-NEXT: ; %bb.1:
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s4
+; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1164_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1164_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], 0 glc
+; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164_DPP-NEXT: buffer_gl1_inv
+; GFX1164_DPP-NEXT: buffer_gl0_inv
+; GFX1164_DPP-NEXT: .LBB5_2:
+; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v9
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_u32 v8, vcc, s2, v10
+; GFX1164_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc
+; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0
+; GFX1164_DPP-NEXT: s_nop 0
+; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT: s_endpgm
+;
+; GFX1132_DPP-LABEL: add_i64_varying:
+; GFX1132_DPP: ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v9
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v5, v4
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo
+; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_readlane_b32 s8, v4, 15
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v2, s8, 16
+; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: s_cbranch_execz .LBB5_2
+; GFX1132_DPP-NEXT: ; %bb.1:
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
+; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1132_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1132_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], 0 glc
+; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132_DPP-NEXT: buffer_gl1_inv
+; GFX1132_DPP-NEXT: buffer_gl0_inv
+; GFX1132_DPP-NEXT: .LBB5_2:
+; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10
+; GFX1132_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
+; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0
+; GFX1132_DPP-NEXT: s_nop 0
+; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT: s_endpgm
+;
+; GFX1264_DPP-LABEL: add_i64_varying:
+; GFX1264_DPP: ; %bb.0: ; %entry
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v9
+; GFX1264_DPP-NEXT: s_not_b64 exec, exec
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1264_DPP-NEXT: s_not_b64 exec, exec
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
+; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT: v_readlane_b32 s5, v3, 31
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5
+; GFX1264_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v7, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_readlane_b32 s6, v3, 15
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_readlane_b32 s7, v4, 15
+; GFX1264_DPP-NEXT: v_readlane_b32 s8, v3, 31
+; GFX1264_DPP-NEXT: v_readlane_b32 s9, v4, 31
+; GFX1264_DPP-NEXT: v_writelane_b32 v1, s6, 16
+; GFX1264_DPP-NEXT: v_readlane_b32 s6, v3, 63
+; GFX1264_DPP-NEXT: v_writelane_b32 v2, s7, 16
+; GFX1264_DPP-NEXT: v_readlane_b32 s10, v3, 47
+; GFX1264_DPP-NEXT: v_readlane_b32 s11, v4, 47
+; GFX1264_DPP-NEXT: v_readlane_b32 s7, v4, 63
+; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32
+; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7]
+; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48
+; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48
+; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9]
+; GFX1264_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec
+; GFX1264_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1264_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1264_DPP-NEXT: s_cbranch_execz .LBB5_2
+; GFX1264_DPP-NEXT: ; %bb.1:
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, s5
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, s4
+; GFX1264_DPP-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1264_DPP-NEXT: s_mov_b32 s4, s2
+; GFX1264_DPP-NEXT: s_mov_b32 s5, s3
+; GFX1264_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN
+; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0
+; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV
+; GFX1264_DPP-NEXT: .LBB5_2:
+; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
+; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v8
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v9
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_co_u32 v8, vcc, s2, v10
+; GFX1264_DPP-NEXT: s_mov_b32 s2, s6
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc
+; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null
+; GFX1264_DPP-NEXT: s_nop 0
+; GFX1264_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1264_DPP-NEXT: s_endpgm
+;
+; GFX1232_DPP-LABEL: add_i64_varying:
+; GFX1232_DPP: ; %bb.0: ; %entry
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v8, v0
+; GFX1232_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0
+; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s2
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v9
+; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1232_DPP-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1232_DPP-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v2
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
----------------
jayfoad wrote:
As a follow-up, we should try to improve this DPP code. The docs say that v_add_co_ci_u32_e32 has a DPP form. I don't know why we're not using it here.
https://github.com/llvm/llvm-project/pull/96934
More information about the llvm-commits
mailing list