[llvm] [AMDGPU] Enable atomic optimizer for divergent i64 and double values (PR #96934)

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 12 00:08:39 PDT 2024


================
@@ -1725,106 +2344,1350 @@ entry:
 }
 
 define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
-; GFX7LESS-LABEL: add_i64_varying:
-; GFX7LESS:       ; %bb.0: ; %entry
-; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s6, -1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT:    s_mov_b32 s10, s6
-; GFX7LESS-NEXT:    s_mov_b32 s11, s7
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_mov_b32 s8, s2
-; GFX7LESS-NEXT:    s_mov_b32 s9, s3
-; GFX7LESS-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    buffer_wbinvl1
-; GFX7LESS-NEXT:    s_mov_b32 s4, s0
-; GFX7LESS-NEXT:    s_mov_b32 s5, s1
-; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX7LESS-NEXT:    s_endpgm
+; GFX7LESS_ITERATIVE-LABEL: add_i64_varying:
+; GFX7LESS_ITERATIVE:       ; %bb.0: ; %entry
+; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
+; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX7LESS_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX7LESS_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[2:3]
+; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 m0, s6
+; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s6
+; GFX7LESS_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s6
+; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, m0
+; GFX7LESS_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX7LESS_ITERATIVE-NEXT:    s_add_u32 s4, s4, s8
+; GFX7LESS_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s7
+; GFX7LESS_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
+; GFX7LESS_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
+; GFX7LESS_ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_vccnz .LBB5_1
+; GFX7LESS_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX7LESS_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX7LESS_ITERATIVE-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX7LESS_ITERATIVE-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
+; GFX7LESS_ITERATIVE-NEXT:  ; %bb.3:
+; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s10, -1
+; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
+; GFX7LESS_ITERATIVE-NEXT:    buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT:    buffer_wbinvl1
+; GFX7LESS_ITERATIVE-NEXT:  .LBB5_4:
+; GFX7LESS_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX7LESS_ITERATIVE-NEXT:    v_add_i32_e32 v0, vcc, s5, v1
+; GFX7LESS_ITERATIVE-NEXT:    v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
+;
+; GFX8_ITERATIVE-LABEL: add_i64_varying:
+; GFX8_ITERATIVE:       ; %bb.0: ; %entry
+; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX8_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX8_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[2:3]
+; GFX8_ITERATIVE-NEXT:    s_mov_b32 m0, s6
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s6
+; GFX8_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s6
+; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX8_ITERATIVE-NEXT:    s_add_u32 s4, s4, s8
+; GFX8_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, m0
+; GFX8_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s7
+; GFX8_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
+; GFX8_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX8_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX8_ITERATIVE-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX8_ITERATIVE-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX8_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
+; GFX8_ITERATIVE-NEXT:  ; %bb.3:
+; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX8_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
+; GFX8_ITERATIVE-NEXT:    s_mov_b32 s10, -1
+; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX8_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
+; GFX8_ITERATIVE-NEXT:    buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX8_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX8_ITERATIVE-NEXT:    buffer_wbinvl1_vol
+; GFX8_ITERATIVE-NEXT:  .LBB5_4:
+; GFX8_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX8_ITERATIVE-NEXT:    v_add_u32_e32 v0, vcc, s5, v1
+; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX8_ITERATIVE-NEXT:    v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; GFX8_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8_ITERATIVE-NEXT:    s_endpgm
+;
+; GFX9_ITERATIVE-LABEL: add_i64_varying:
+; GFX9_ITERATIVE:       ; %bb.0: ; %entry
+; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX9_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX9_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[2:3]
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 m0, s6
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s8, v0, s6
+; GFX9_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s6
+; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9_ITERATIVE-NEXT:    s_add_u32 s4, s4, s8
+; GFX9_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, m0
+; GFX9_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s7
+; GFX9_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
+; GFX9_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX9_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX9_ITERATIVE-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX9_ITERATIVE-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX9_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
+; GFX9_ITERATIVE-NEXT:  ; %bb.3:
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s10, -1
+; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
+; GFX9_ITERATIVE-NEXT:    buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX9_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX9_ITERATIVE-NEXT:    buffer_wbinvl1_vol
+; GFX9_ITERATIVE-NEXT:  .LBB5_4:
+; GFX9_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9_ITERATIVE-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v1
+; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX9_ITERATIVE-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v2, vcc
+; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9_ITERATIVE-NEXT:    s_endpgm
+;
+; GFX1064_ITERATIVE-LABEL: add_i64_varying:
+; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
+; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX1064_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX1064_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[2:3]
+; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s6
+; GFX1064_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s6
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s6
+; GFX1064_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, s6
+; GFX1064_ITERATIVE-NEXT:    s_add_u32 s4, s4, s7
+; GFX1064_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s8
+; GFX1064_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
+; GFX1064_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1064_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX1064_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_ITERATIVE-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1064_ITERATIVE-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1064_ITERATIVE-NEXT:  ; %bb.3:
+; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s10, -1
+; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX1064_ITERATIVE-NEXT:    buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX1064_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064_ITERATIVE-NEXT:    buffer_gl1_inv
+; GFX1064_ITERATIVE-NEXT:    buffer_gl0_inv
+; GFX1064_ITERATIVE-NEXT:  .LBB5_4:
+; GFX1064_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1064_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX1064_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc, s2, v1
+; GFX1064_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX1064_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064_ITERATIVE-NEXT:    s_endpgm
+;
+; GFX1032_ITERATIVE-LABEL: add_i64_varying:
+; GFX1032_ITERATIVE:       ; %bb.0: ; %entry
+; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1032_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX1032_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX1032_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032_ITERATIVE-NEXT:    s_ff1_i32_b32 s3, s2
+; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1032_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s3
+; GFX1032_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, s3
+; GFX1032_ITERATIVE-NEXT:    s_add_u32 s4, s4, s6
+; GFX1032_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s7
+; GFX1032_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
+; GFX1032_ITERATIVE-NEXT:    s_andn2_b32 s2, s2, s3
+; GFX1032_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1032_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX1032_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_ITERATIVE-NEXT:    s_and_saveexec_b32 s6, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    s_xor_b32 s6, exec_lo, s6
+; GFX1032_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1032_ITERATIVE-NEXT:  ; %bb.3:
+; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX1032_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s10, -1
+; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX1032_ITERATIVE-NEXT:    buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc
+; GFX1032_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032_ITERATIVE-NEXT:    buffer_gl1_inv
+; GFX1032_ITERATIVE-NEXT:    buffer_gl0_inv
+; GFX1032_ITERATIVE-NEXT:  .LBB5_4:
+; GFX1032_ITERATIVE-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1032_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX1032_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1032_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX1032_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032_ITERATIVE-NEXT:    s_endpgm
+;
+; GFX1164_ITERATIVE-LABEL: add_i64_varying:
+; GFX1164_ITERATIVE:       ; %bb.0: ; %entry
+; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX1164_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX1164_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164_ITERATIVE-NEXT:    s_ctz_i32_b64 s6, s[2:3]
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s7, v0, s6
+; GFX1164_ITERATIVE-NEXT:    v_readlane_b32 s8, v3, s6
+; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s6
+; GFX1164_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, s6
+; GFX1164_ITERATIVE-NEXT:    s_add_u32 s4, s4, s7
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s8
+; GFX1164_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
+; GFX1164_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1164_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_ITERATIVE-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_ITERATIVE-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164_ITERATIVE-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1164_ITERATIVE-NEXT:  ; %bb.3:
+; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
+; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s10, -1
+; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX1164_ITERATIVE-NEXT:    buffer_atomic_add_u64 v[3:4], off, s[8:11], 0 glc
+; GFX1164_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164_ITERATIVE-NEXT:    buffer_gl1_inv
+; GFX1164_ITERATIVE-NEXT:    buffer_gl0_inv
+; GFX1164_ITERATIVE-NEXT:  .LBB5_4:
+; GFX1164_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc, s2, v1
+; GFX1164_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164_ITERATIVE-NEXT:    s_nop 0
+; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_ITERATIVE-NEXT:    s_endpgm
+;
+; GFX1132_ITERATIVE-LABEL: add_i64_varying:
+; GFX1132_ITERATIVE:       ; %bb.0: ; %entry
+; GFX1132_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1132_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX1132_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX1132_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1132_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
+; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s3
+; GFX1132_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, s3
+; GFX1132_ITERATIVE-NEXT:    s_add_u32 s4, s4, s6
+; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT:    s_addc_u32 s5, s5, s7
+; GFX1132_ITERATIVE-NEXT:    s_lshl_b32 s3, 1, s3
+; GFX1132_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1132_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1132_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_ITERATIVE-NEXT:    s_xor_b32 s6, exec_lo, s6
+; GFX1132_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1132_ITERATIVE-NEXT:  ; %bb.3:
+; GFX1132_ITERATIVE-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s10, -1
+; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX1132_ITERATIVE-NEXT:    buffer_atomic_add_u64 v[3:4], off, s[8:11], 0 glc
+; GFX1132_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132_ITERATIVE-NEXT:    buffer_gl1_inv
+; GFX1132_ITERATIVE-NEXT:    buffer_gl0_inv
+; GFX1132_ITERATIVE-NEXT:  .LBB5_4:
+; GFX1132_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1132_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132_ITERATIVE-NEXT:    s_nop 0
+; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_ITERATIVE-NEXT:    s_endpgm
+;
+; GFX1264_ITERATIVE-LABEL: add_i64_varying:
+; GFX1264_ITERATIVE:       ; %bb.0: ; %entry
+; GFX1264_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1264_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1264_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1264_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX1264_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX1264_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1264_ITERATIVE-NEXT:    s_ctz_i32_b64 s10, s[2:3]
+; GFX1264_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s10
+; GFX1264_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s10
+; GFX1264_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s10
+; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, s10
+; GFX1264_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s10
+; GFX1264_ITERATIVE-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1264_ITERATIVE-NEXT:    s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1264_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1264_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1264_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1264_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264_ITERATIVE-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1264_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX1264_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264_ITERATIVE-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1264_ITERATIVE-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1264_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1264_ITERATIVE-NEXT:  ; %bb.3:
+; GFX1264_ITERATIVE-NEXT:    v_mov_b32_e32 v3, s4
+; GFX1264_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s5
+; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s10, -1
+; GFX1264_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
+; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX1264_ITERATIVE-NEXT:    buffer_atomic_add_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1264_ITERATIVE-NEXT:    s_wait_loadcnt 0x0
+; GFX1264_ITERATIVE-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1264_ITERATIVE-NEXT:  .LBB5_4:
+; GFX1264_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX1264_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
+; GFX1264_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1264_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX1264_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc, s2, v1
+; GFX1264_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX1264_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1264_ITERATIVE-NEXT:    s_nop 0
+; GFX1264_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1264_ITERATIVE-NEXT:    s_endpgm
+;
+; GFX1232_ITERATIVE-LABEL: add_i64_varying:
+; GFX1232_ITERATIVE:       ; %bb.0: ; %entry
+; GFX1232_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1232_ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1232_ITERATIVE-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX1232_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX1232_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT:    s_ctz_i32_b32 s3, s2
+; GFX1232_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s3
+; GFX1232_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1232_ITERATIVE-NEXT:    s_lshl_b32 s8, 1, s3
+; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v2, s5, s3
+; GFX1232_ITERATIVE-NEXT:    v_writelane_b32 v1, s4, s3
+; GFX1232_ITERATIVE-NEXT:    s_and_not1_b32 s2, s2, s8
+; GFX1232_ITERATIVE-NEXT:    s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1232_ITERATIVE-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX1232_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX1232_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1232_ITERATIVE-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1232_ITERATIVE-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX1232_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1232_ITERATIVE-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1232_ITERATIVE-NEXT:    s_xor_b32 s6, exec_lo, s6
+; GFX1232_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
+; GFX1232_ITERATIVE-NEXT:  ; %bb.3:
+; GFX1232_ITERATIVE-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s10, -1
+; GFX1232_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
+; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX1232_ITERATIVE-NEXT:    buffer_atomic_add_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN
+; GFX1232_ITERATIVE-NEXT:    s_wait_loadcnt 0x0
+; GFX1232_ITERATIVE-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1232_ITERATIVE-NEXT:  .LBB5_4:
+; GFX1232_ITERATIVE-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; GFX1232_ITERATIVE-NEXT:    s_wait_kmcnt 0x0
+; GFX1232_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1232_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX1232_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc_lo, s2, v1
+; GFX1232_ITERATIVE-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX1232_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1232_ITERATIVE-NEXT:    s_nop 0
+; GFX1232_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1232_ITERATIVE-NEXT:    s_endpgm
+;
+; GFX7LESS_DPP-LABEL: add_i64_varying:
+; GFX7LESS_DPP:       ; %bb.0: ; %entry
+; GFX7LESS_DPP-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7LESS_DPP-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7LESS_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX7LESS_DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7LESS_DPP-NEXT:    s_mov_b32 s10, s6
+; GFX7LESS_DPP-NEXT:    s_mov_b32 s11, s7
+; GFX7LESS_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS_DPP-NEXT:    s_mov_b32 s8, s2
+; GFX7LESS_DPP-NEXT:    s_mov_b32 s9, s3
+; GFX7LESS_DPP-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
+; GFX7LESS_DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS_DPP-NEXT:    buffer_wbinvl1
+; GFX7LESS_DPP-NEXT:    s_mov_b32 s4, s0
+; GFX7LESS_DPP-NEXT:    s_mov_b32 s5, s1
+; GFX7LESS_DPP-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX7LESS_DPP-NEXT:    s_endpgm
+;
+; GFX8_DPP-LABEL: add_i64_varying:
+; GFX8_DPP:       ; %bb.0: ; %entry
+; GFX8_DPP-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v0
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, 0
+; GFX8_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, v7
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, v8
+; GFX8_DPP-NEXT:    s_not_b64 exec, exec
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT:    s_not_b64 exec, exec
+; GFX8_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    s_nop 0
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    s_nop 0
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    s_nop 0
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    s_nop 0
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    s_nop 0
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    s_nop 0
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    s_nop 0
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    s_nop 0
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    s_nop 0
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    s_nop 0
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT:    s_nop 0
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT:    v_readlane_b32 s7, v4, 63
+; GFX8_DPP-NEXT:    v_readlane_b32 s6, v3, 63
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
+; GFX8_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8_DPP-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8_DPP-NEXT:  ; %bb.1:
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, s7
+; GFX8_DPP-NEXT:    s_mov_b32 s11, 0xf000
+; GFX8_DPP-NEXT:    s_mov_b32 s10, -1
+; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT:    s_mov_b32 s8, s2
+; GFX8_DPP-NEXT:    s_mov_b32 s9, s3
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, s6
+; GFX8_DPP-NEXT:    buffer_atomic_add_x2 v[7:8], off, s[8:11], 0 glc
+; GFX8_DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX8_DPP-NEXT:    buffer_wbinvl1_vol
+; GFX8_DPP-NEXT:  .LBB5_2:
+; GFX8_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v8
+; GFX8_DPP-NEXT:    v_readfirstlane_b32 s5, v7
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8_DPP-NEXT:    v_add_u32_e32 v7, vcc, s5, v7
+; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX8_DPP-NEXT:    v_addc_u32_e32 v8, vcc, v0, v8, vcc
+; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX8_DPP-NEXT:    s_endpgm
+;
+; GFX9_DPP-LABEL: add_i64_varying:
+; GFX9_DPP:       ; %bb.0: ; %entry
+; GFX9_DPP-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, v7
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, v8
+; GFX9_DPP-NEXT:    s_not_b64 exec, exec
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT:    s_not_b64 exec, exec
+; GFX9_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    s_nop 0
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    s_nop 0
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    s_nop 0
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    s_nop 0
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    s_nop 0
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    s_nop 0
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    s_nop 0
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    s_nop 0
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    s_nop 0
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    s_nop 0
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT:    s_nop 0
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT:    v_readlane_b32 s7, v4, 63
+; GFX9_DPP-NEXT:    v_readlane_b32 s6, v3, 63
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
+; GFX9_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9_DPP-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9_DPP-NEXT:  ; %bb.1:
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, s7
+; GFX9_DPP-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9_DPP-NEXT:    s_mov_b32 s10, -1
+; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT:    s_mov_b32 s8, s2
+; GFX9_DPP-NEXT:    s_mov_b32 s9, s3
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, s6
+; GFX9_DPP-NEXT:    buffer_atomic_add_x2 v[7:8], off, s[8:11], 0 glc
+; GFX9_DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9_DPP-NEXT:    buffer_wbinvl1_vol
+; GFX9_DPP-NEXT:  .LBB5_2:
+; GFX9_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v8
+; GFX9_DPP-NEXT:    v_readfirstlane_b32 s5, v7
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9_DPP-NEXT:    v_add_co_u32_e32 v7, vcc, s5, v7
+; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX9_DPP-NEXT:    v_addc_co_u32_e32 v8, vcc, v0, v8, vcc
+; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX9_DPP-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: add_i64_varying:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT:    s_mov_b32 s7, 0xf000
-; GFX89-NEXT:    s_mov_b32 s6, -1
-; GFX89-NEXT:    s_mov_b32 s10, s6
-; GFX89-NEXT:    s_mov_b32 s11, s7
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    v_mov_b32_e32 v1, 0
-; GFX89-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    buffer_wbinvl1_vol
-; GFX89-NEXT:    s_mov_b32 s4, s0
-; GFX89-NEXT:    s_mov_b32 s5, s1
-; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX89-NEXT:    s_endpgm
-;
-; GFX10-LABEL: add_i64_varying:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s6, -1
-; GFX10-NEXT:    s_mov_b32 s11, s7
-; GFX10-NEXT:    s_mov_b32 s10, s6
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s8, s2
-; GFX10-NEXT:    s_mov_b32 s9, s3
-; GFX10-NEXT:    s_mov_b32 s4, s0
-; GFX10-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl1_inv
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_mov_b32 s5, s1
-; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: add_i64_varying:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s6, -1
-; GFX11-NEXT:    s_mov_b32 s11, s7
-; GFX11-NEXT:    s_mov_b32 s10, s6
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s8, s2
-; GFX11-NEXT:    s_mov_b32 s9, s3
-; GFX11-NEXT:    s_mov_b32 s4, s0
-; GFX11-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl1_inv
-; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    s_mov_b32 s5, s1
-; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: add_i64_varying:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s6, -1
-; GFX12-NEXT:    s_mov_b32 s11, s7
-; GFX12-NEXT:    s_mov_b32 s10, s6
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s2
-; GFX12-NEXT:    s_mov_b32 s9, s3
-; GFX12-NEXT:    s_mov_b32 s4, s0
-; GFX12-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_mov_b32 s5, s1
-; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT:    s_endpgm
+; GFX1064_DPP-LABEL: add_i64_varying:
+; GFX1064_DPP:       ; %bb.0: ; %entry
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, v0
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, v9
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, v10
+; GFX1064_DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064_DPP-NEXT:    s_not_b64 exec, exec
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v7
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT:    v_readlane_b32 s4, v3, 31
+; GFX1064_DPP-NEXT:    v_readlane_b32 s5, v4, 31
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v6, s4
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, s5
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_add_co_u32 v3, vcc, v3, v5
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1064_DPP-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v4, 15
+; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v3, 15
+; GFX1064_DPP-NEXT:    v_readlane_b32 s8, v4, 31
+; GFX1064_DPP-NEXT:    v_readlane_b32 s9, v3, 31
+; GFX1064_DPP-NEXT:    v_readlane_b32 s10, v3, 47
+; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s6, 16
+; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s7, 16
+; GFX1064_DPP-NEXT:    v_readlane_b32 s6, v3, 63
+; GFX1064_DPP-NEXT:    v_readlane_b32 s11, v4, 47
+; GFX1064_DPP-NEXT:    v_readlane_b32 s7, v4, 63
+; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s8, 32
+; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s9, 32
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX1064_DPP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX1064_DPP-NEXT:    v_writelane_b32 v2, s11, 48
+; GFX1064_DPP-NEXT:    v_writelane_b32 v1, s10, 48
+; GFX1064_DPP-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX1064_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1064_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1064_DPP-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX1064_DPP-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064_DPP-NEXT:  ; %bb.1:
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v10, s5
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v9, s4
+; GFX1064_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT:    s_mov_b32 s4, s2
+; GFX1064_DPP-NEXT:    s_mov_b32 s5, s3
+; GFX1064_DPP-NEXT:    buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc
+; GFX1064_DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064_DPP-NEXT:    buffer_gl1_inv
+; GFX1064_DPP-NEXT:    buffer_gl0_inv
+; GFX1064_DPP-NEXT:  .LBB5_2:
+; GFX1064_DPP-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1064_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s2, v9
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1064_DPP-NEXT:    v_mov_b32_e32 v12, v2
+; GFX1064_DPP-NEXT:    v_readfirstlane_b32 s3, v10
+; GFX1064_DPP-NEXT:    v_add_co_u32 v9, vcc, s2, v11
+; GFX1064_DPP-NEXT:    s_mov_b32 s2, s6
+; GFX1064_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc, s3, v12, vcc
+; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1064_DPP-NEXT:    s_endpgm
+;
+; GFX1032_DPP-LABEL: add_i64_varying:
+; GFX1032_DPP:       ; %bb.0: ; %entry
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, v9
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, v10
+; GFX1032_DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1032_DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v5
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1032_DPP-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v7
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
+; GFX1032_DPP-NEXT:    v_readlane_b32 s4, v3, 31
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_readlane_b32 s8, v4, 15
+; GFX1032_DPP-NEXT:    v_readlane_b32 s5, v4, 31
+; GFX1032_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT:    v_readlane_b32 s7, v3, 15
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s8, 16
+; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s7, 16
+; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
+; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032_DPP-NEXT:  ; %bb.1:
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, s5
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, s4
+; GFX1032_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT:    s_mov_b32 s4, s2
+; GFX1032_DPP-NEXT:    s_mov_b32 s5, s3
+; GFX1032_DPP-NEXT:    buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc
+; GFX1032_DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032_DPP-NEXT:    buffer_gl1_inv
+; GFX1032_DPP-NEXT:    buffer_gl0_inv
+; GFX1032_DPP-NEXT:  .LBB5_2:
+; GFX1032_DPP-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1032_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s2, v9
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1032_DPP-NEXT:    v_mov_b32_e32 v12, v2
+; GFX1032_DPP-NEXT:    v_readfirstlane_b32 s3, v10
+; GFX1032_DPP-NEXT:    v_add_co_u32 v9, vcc_lo, s2, v11
+; GFX1032_DPP-NEXT:    s_mov_b32 s2, s6
+; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo
+; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[9:10], off, s[0:3], 0
+; GFX1032_DPP-NEXT:    s_endpgm
+;
+; GFX1164_DPP-LABEL: add_i64_varying:
+; GFX1164_DPP:       ; %bb.0: ; %entry
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v0
+; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, 0
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, v8
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, v9
+; GFX1164_DPP-NEXT:    s_not_b64 exec, exec
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT:    s_not_b64 exec, exec
+; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX1164_DPP-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT:    v_readlane_b32 s5, v3, 31
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v7, s5
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v7, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v6, s4
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1164_DPP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v3, 15
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v4, 15
+; GFX1164_DPP-NEXT:    v_readlane_b32 s8, v3, 31
+; GFX1164_DPP-NEXT:    v_readlane_b32 s9, v4, 31
+; GFX1164_DPP-NEXT:    v_writelane_b32 v1, s6, 16
+; GFX1164_DPP-NEXT:    v_readlane_b32 s6, v3, 63
+; GFX1164_DPP-NEXT:    v_writelane_b32 v2, s7, 16
+; GFX1164_DPP-NEXT:    v_readlane_b32 s10, v3, 47
+; GFX1164_DPP-NEXT:    v_readlane_b32 s11, v4, 47
+; GFX1164_DPP-NEXT:    v_readlane_b32 s7, v4, 63
+; GFX1164_DPP-NEXT:    v_writelane_b32 v1, s8, 32
+; GFX1164_DPP-NEXT:    v_writelane_b32 v2, s9, 32
+; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX1164_DPP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX1164_DPP-NEXT:    v_writelane_b32 v1, s10, 48
+; GFX1164_DPP-NEXT:    v_writelane_b32 v2, s11, 48
+; GFX1164_DPP-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX1164_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1164_DPP-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX1164_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164_DPP-NEXT:  ; %bb.1:
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v9, s5
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, s4
+; GFX1164_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT:    s_mov_b32 s4, s2
+; GFX1164_DPP-NEXT:    s_mov_b32 s5, s3
+; GFX1164_DPP-NEXT:    buffer_atomic_add_u64 v[8:9], off, s[4:7], 0 glc
+; GFX1164_DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164_DPP-NEXT:    buffer_gl1_inv
+; GFX1164_DPP-NEXT:    buffer_gl0_inv
+; GFX1164_DPP-NEXT:  .LBB5_2:
+; GFX1164_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s2, v8
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v10, v1
+; GFX1164_DPP-NEXT:    v_mov_b32_e32 v11, v2
+; GFX1164_DPP-NEXT:    v_readfirstlane_b32 s3, v9
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_add_co_u32 v8, vcc, s2, v10
+; GFX1164_DPP-NEXT:    s_mov_b32 s2, s6
+; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc
+; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
+; GFX1164_DPP-NEXT:    s_nop 0
+; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164_DPP-NEXT:    s_endpgm
+;
+; GFX1132_DPP-LABEL: add_i64_varying:
+; GFX1132_DPP:       ; %bb.0: ; %entry
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v0
+; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v9, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, v8
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, v9
+; GFX1132_DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132_DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v5, v4
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1132_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132_DPP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo
+; GFX1132_DPP-NEXT:    v_readlane_b32 s4, v3, 31
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    v_readlane_b32 s7, v3, 15
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT:    v_readlane_b32 s8, v4, 15
+; GFX1132_DPP-NEXT:    v_readlane_b32 s5, v4, 31
+; GFX1132_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1132_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s7, 16
+; GFX1132_DPP-NEXT:    v_writelane_b32 v2, s8, 16
+; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX1132_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132_DPP-NEXT:  ; %bb.1:
+; GFX1132_DPP-NEXT:    v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
+; GFX1132_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT:    s_mov_b32 s4, s2
+; GFX1132_DPP-NEXT:    s_mov_b32 s5, s3
+; GFX1132_DPP-NEXT:    buffer_atomic_add_u64 v[8:9], off, s[4:7], 0 glc
+; GFX1132_DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132_DPP-NEXT:    buffer_gl1_inv
+; GFX1132_DPP-NEXT:    buffer_gl0_inv
+; GFX1132_DPP-NEXT:  .LBB5_2:
+; GFX1132_DPP-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s2, v8
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v10, v1
+; GFX1132_DPP-NEXT:    v_mov_b32_e32 v11, v2
+; GFX1132_DPP-NEXT:    v_readfirstlane_b32 s3, v9
+; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT:    v_add_co_u32 v8, vcc_lo, s2, v10
+; GFX1132_DPP-NEXT:    s_mov_b32 s2, s6
+; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
+; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
+; GFX1132_DPP-NEXT:    s_nop 0
+; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132_DPP-NEXT:    s_endpgm
+;
+; GFX1264_DPP-LABEL: add_i64_varying:
+; GFX1264_DPP:       ; %bb.0: ; %entry
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v8, v0
+; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v9, 0
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v3, v8
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v4, v9
+; GFX1264_DPP-NEXT:    s_not_b64 exec, exec
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1264_DPP-NEXT:    s_not_b64 exec, exec
+; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1264_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v6, v3
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1264_DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1264_DPP-NEXT:    v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT:    v_readlane_b32 s5, v3, 31
+; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v7, s5
+; GFX1264_DPP-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v7, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v6, s4
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT:    v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc
+; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1264_DPP-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264_DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1264_DPP-NEXT:    v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT:    v_readlane_b32 s6, v3, 15
+; GFX1264_DPP-NEXT:    v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT:    v_readlane_b32 s7, v4, 15
+; GFX1264_DPP-NEXT:    v_readlane_b32 s8, v3, 31
+; GFX1264_DPP-NEXT:    v_readlane_b32 s9, v4, 31
+; GFX1264_DPP-NEXT:    v_writelane_b32 v1, s6, 16
+; GFX1264_DPP-NEXT:    v_readlane_b32 s6, v3, 63
+; GFX1264_DPP-NEXT:    v_writelane_b32 v2, s7, 16
+; GFX1264_DPP-NEXT:    v_readlane_b32 s10, v3, 47
+; GFX1264_DPP-NEXT:    v_readlane_b32 s11, v4, 47
+; GFX1264_DPP-NEXT:    v_readlane_b32 s7, v4, 63
+; GFX1264_DPP-NEXT:    v_writelane_b32 v1, s8, 32
+; GFX1264_DPP-NEXT:    v_writelane_b32 v2, s9, 32
+; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1264_DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264_DPP-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX1264_DPP-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX1264_DPP-NEXT:    v_writelane_b32 v1, s10, 48
+; GFX1264_DPP-NEXT:    v_writelane_b32 v2, s11, 48
+; GFX1264_DPP-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX1264_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1264_DPP-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1264_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX1264_DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1264_DPP-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1264_DPP-NEXT:  ; %bb.1:
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v9, s5
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v8, s4
+; GFX1264_DPP-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1264_DPP-NEXT:    s_wait_kmcnt 0x0
+; GFX1264_DPP-NEXT:    s_mov_b32 s4, s2
+; GFX1264_DPP-NEXT:    s_mov_b32 s5, s3
+; GFX1264_DPP-NEXT:    buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN
+; GFX1264_DPP-NEXT:    s_wait_loadcnt 0x0
+; GFX1264_DPP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1264_DPP-NEXT:  .LBB5_2:
+; GFX1264_DPP-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX1264_DPP-NEXT:    s_wait_kmcnt 0x0
+; GFX1264_DPP-NEXT:    v_readfirstlane_b32 s2, v8
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v10, v1
+; GFX1264_DPP-NEXT:    v_mov_b32_e32 v11, v2
+; GFX1264_DPP-NEXT:    v_readfirstlane_b32 s3, v9
+; GFX1264_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT:    v_add_co_u32 v8, vcc, s2, v10
+; GFX1264_DPP-NEXT:    s_mov_b32 s2, s6
+; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc
+; GFX1264_DPP-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], null
+; GFX1264_DPP-NEXT:    s_nop 0
+; GFX1264_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1264_DPP-NEXT:    s_endpgm
+;
+; GFX1232_DPP-LABEL: add_i64_varying:
+; GFX1232_DPP:       ; %bb.0: ; %entry
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v8, v0
+; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v9, 0
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v3, v8
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v4, v9
+; GFX1232_DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1232_DPP-NEXT:    s_not_b32 exec_lo, exec_lo
+; GFX1232_DPP-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1232_DPP-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v2
+; GFX1232_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT:    v_mov_b32_e32 v7, v2
+; GFX1232_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
----------------
jayfoad wrote:

As a follow-up, we should try to improve this DPP code. The docs say that v_add_co_ci_u32_e32 has a DPP form. I don't know why we're not using it here.

https://github.com/llvm/llvm-project/pull/96934


More information about the llvm-commits mailing list