[llvm] [AMDGPU] ISel & PEI for whole wave functions (PR #145858)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 10 10:27:50 PDT 2026
================
@@ -0,0 +1,2414 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL64 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL64 %s
+
+; Make sure the i1 %active is passed through EXEC.
+; The EXEC mask should be set to -1 for the duration of the function
+; and restored to its original value in the epilogue.
+; We will also need to restore the inactive lanes for any allocated VGPRs.
+define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: basic_test:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: basic_test:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_store_b32 off, v0, s32
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_load_b32 v0, off, s32
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: basic_test:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, vcc
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: basic_test:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; GISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, vcc
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %x = select i1 %active, i32 %a, i32 5 ; %a where %active is set, 5 in the other lanes
+ %y = select i1 %active, i32 %b, i32 3 ; %b where %active is set, 3 in the other lanes
+ %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) ; becomes the v_mov_b32_dpp in the checks above
+ ret i32 %ret
+}
+
+; Make sure we don't crash if there's only one use for %active.
+define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: single_use_of_active:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc_lo
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: single_use_of_active:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_store_b32 off, v0, s32
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc_lo
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_load_b32 v0, off, s32
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: single_use_of_active:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, vcc
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: single_use_of_active:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, vcc
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %y = select i1 %active, i32 %b, i32 17 ; the only use of %active: %b where it is set, 17 elsewhere
+ %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %a, i32 %y, i32 1, i32 1, i32 1, i1 false) ; %a is passed to the DPP update directly, without a select
+ ret i32 %ret
+}
+
+; Make sure we don't crash if %active is not used at all.
+define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: unused_active:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: v_mov_b32_e32 v0, 14
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: unused_active:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: v_mov_b32_e32 v0, 14
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1
+; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: unused_active:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: v_mov_b32_e32 v0, 14
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: unused_active:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: v_mov_b32_e32 v0, 14
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ ret i32 14 ; constant result: %active, %a and %b are never read
+}
+
+; For any used VGPRs (including those used for SGPR spills), we need to restore the inactive lanes.
+; For CSR VGPRs, we need to restore all lanes.
+define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: csr:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x3
+; DAGISEL-NEXT: scratch_store_b32 off, v2, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:8
+; DAGISEL-NEXT: scratch_store_b32 off, v49, s32 offset:16
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; DAGISEL-NEXT: ;;#ASMSTART
+; DAGISEL-NEXT: ; clobber CSR
+; DAGISEL-NEXT: ;;#ASMEND
+; DAGISEL-NEXT: v_writelane_b32 v2, s20, 0
+; DAGISEL-NEXT: ;;#ASMSTART
+; DAGISEL-NEXT: ; clobber non-CSR
+; DAGISEL-NEXT: ;;#ASMEND
+; DAGISEL-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1
+; DAGISEL-NEXT: v_readlane_b32 s20, v2, 0
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x3
+; DAGISEL-NEXT: scratch_load_b32 v2, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8
+; DAGISEL-NEXT: scratch_load_b32 v49, off, s32 offset:16
+; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_wait_alu 0xf1ff
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: csr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: scratch_store_b32 off, v2, s32
+; GISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:8
+; GISEL-NEXT: scratch_store_b32 off, v49, s32 offset:16
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; clobber CSR
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: v_writelane_b32 v2, s20, 0
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; clobber non-CSR
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1
+; GISEL-NEXT: v_readlane_b32 s20, v2, 0
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: scratch_load_b32 v2, off, s32
+; GISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8
+; GISEL-NEXT: scratch_load_b32 v49, off, s32 offset:16
+; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_wait_alu 0xf1ff
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: csr:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT: s_clause 0x3
+; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 offset:4
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:8
+; DAGISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:16
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; DAGISEL64-NEXT: ;;#ASMSTART
+; DAGISEL64-NEXT: ; clobber CSR
+; DAGISEL64-NEXT: ;;#ASMEND
+; DAGISEL64-NEXT: v_writelane_b32 v2, s20, 0
+; DAGISEL64-NEXT: ;;#ASMSTART
+; DAGISEL64-NEXT: ; clobber non-CSR
+; DAGISEL64-NEXT: ;;#ASMEND
+; DAGISEL64-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc
+; DAGISEL64-NEXT: v_readlane_b32 s20, v2, 0
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT: s_clause 0x3
+; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8
+; DAGISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:16
+; DAGISEL64-NEXT: s_mov_b64 exec, vcc
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_wait_alu 0xf1ff
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: csr:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT: s_clause 0x3
+; GISEL64-NEXT: scratch_store_b32 off, v2, s32
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32 offset:4
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:8
+; GISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:16
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; GISEL64-NEXT: ;;#ASMSTART
+; GISEL64-NEXT: ; clobber CSR
+; GISEL64-NEXT: ;;#ASMEND
+; GISEL64-NEXT: v_writelane_b32 v2, s20, 0
+; GISEL64-NEXT: ;;#ASMSTART
+; GISEL64-NEXT: ; clobber non-CSR
+; GISEL64-NEXT: ;;#ASMEND
+; GISEL64-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; GISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc
+; GISEL64-NEXT: v_readlane_b32 s20, v2, 0
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT: s_clause 0x3
+; GISEL64-NEXT: scratch_load_b32 v2, off, s32
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8
+; GISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:16
+; GISEL64-NEXT: s_mov_b64 exec, vcc
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_wait_alu 0xf1ff
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %x = select i1 %active, i32 %a, i32 5 ; %a where %active is set, 5 in the other lanes
+ %y = select i1 %active, i32 %b, i32 3 ; %b where %active is set, 3 in the other lanes
+ call void asm sideeffect "; clobber CSR", "~{v40},~{s48}"() ; checks: v40 is spilled/reloaded with EXEC = -1; no save of s48 appears
+ call void asm sideeffect "; clobber non-CSR", "~{v49},~{s20}"() ; checks: v49 is saved under the inverted EXEC, s20 via lane 0 of v2. NOTE(review): the SGPRs look swapped relative to the asm labels - confirm
+ %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) ; becomes the v_mov_b32_dpp in the checks above
+ ret i32 %ret
+}
+
+; Save and restore all lanes of v40.
+define amdgpu_gfx_whole_wave void @csr_vgpr_only(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: csr_vgpr_only:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; DAGISEL-NEXT: ;;#ASMSTART
+; DAGISEL-NEXT: ; clobber CSR VGPR
+; DAGISEL-NEXT: ;;#ASMEND
+; DAGISEL-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: csr_vgpr_only:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; clobber CSR VGPR
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: csr_vgpr_only:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL64-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; DAGISEL64-NEXT: ;;#ASMSTART
+; DAGISEL64-NEXT: ; clobber CSR VGPR
+; DAGISEL64-NEXT: ;;#ASMEND
+; DAGISEL64-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: csr_vgpr_only:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL64-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GISEL64-NEXT: ;;#ASMSTART
+; GISEL64-NEXT: ; clobber CSR VGPR
+; GISEL64-NEXT: ;;#ASMEND
+; GISEL64-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; clobber CSR VGPR", "~{v40}"() ; the only clobber in the body; the checks use s_or_saveexec and save no other VGPR
+ ret void
+}
+
+; The only clobber is s68. The checks expect it to be saved to lane 0 of v0,
+; so the inactive lanes of v0 are saved in the prologue and restored in the
+; epilogue even though the function has no VGPR clobbers of its own.
+define amdgpu_gfx_whole_wave void @sgpr_spill_only(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: sgpr_spill_only:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: v_writelane_b32 v0, s68, 0
+; DAGISEL-NEXT: ;;#ASMSTART
+; DAGISEL-NEXT: ; clobber CSR SGPR
+; DAGISEL-NEXT: ;;#ASMEND
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_readlane_b32 s68, v0, 0
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: sgpr_spill_only:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: v_writelane_b32 v0, s68, 0
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; clobber CSR SGPR
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_readlane_b32 s68, v0, 0
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1
+; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: sgpr_spill_only:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: v_writelane_b32 v0, s68, 0
+; DAGISEL64-NEXT: ;;#ASMSTART
+; DAGISEL64-NEXT: ; clobber CSR SGPR
+; DAGISEL64-NEXT: ;;#ASMEND
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT: v_readlane_b32 s68, v0, 0
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: sgpr_spill_only:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: v_writelane_b32 v0, s68, 0
+; GISEL64-NEXT: ;;#ASMSTART
+; GISEL64-NEXT: ; clobber CSR SGPR
+; GISEL64-NEXT: ;;#ASMEND
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT: v_readlane_b32 s68, v0, 0
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; clobber CSR SGPR", "~{s68}"() ; s68 is written to and read back from lane 0 of v0 in the checks
+ ret void
+}
+
+; Whole wave function with control flow: a per-lane branch on %a == %b, with
+; %active only used in the final block. The checks expect the entry EXEC to
+; stay in vcc across the branch while the branch itself saves EXEC separately.
+define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: multiple_blocks:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL-NEXT: s_mov_b32 s1, exec_lo
+; DAGISEL-NEXT: v_cmpx_eq_u32_e64 v0, v1
+; DAGISEL-NEXT: ; %bb.1: ; %if.then
+; DAGISEL-NEXT: v_add_nc_u32_e32 v1, v0, v1
+; DAGISEL-NEXT: ; %bb.2: ; %if.end
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: multiple_blocks:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_store_b32 off, v0, s32
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GISEL-NEXT: v_cmpx_eq_u32_e64 v0, v1
+; GISEL-NEXT: ; %bb.1: ; %if.then
+; GISEL-NEXT: v_add_nc_u32_e32 v1, v0, v1
+; GISEL-NEXT: ; %bb.2: ; %if.end
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_load_b32 v0, off, s32
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: multiple_blocks:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL64-NEXT: s_mov_b64 s[2:3], exec
+; DAGISEL64-NEXT: v_cmpx_eq_u32_e64 v0, v1
+; DAGISEL64-NEXT: ; %bb.1: ; %if.then
+; DAGISEL64-NEXT: v_add_nc_u32_e32 v1, v0, v1
+; DAGISEL64-NEXT: ; %bb.2: ; %if.end
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_or_b64 exec, exec, s[2:3]
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, vcc
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: multiple_blocks:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL64-NEXT: s_mov_b64 s[2:3], exec
+; GISEL64-NEXT: v_cmpx_eq_u32_e64 v0, v1
+; GISEL64-NEXT: ; %bb.1: ; %if.then
+; GISEL64-NEXT: v_add_nc_u32_e32 v1, v0, v1
+; GISEL64-NEXT: ; %bb.2: ; %if.end
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, vcc
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %c = icmp eq i32 %a, %b
+ br i1 %c, label %if.then, label %if.end
+
+if.then: ; preds = %0
+ %d = add i32 %a, %b
+ br label %if.end
+
+if.end:
+ %f = phi i32 [ %d, %if.then ], [ %b, %0 ]
+ %e = select i1 %active, i32 %a, i32 %f ; first use of %active, after the branch has rejoined
+ ret i32 %e
+}
+
+; 64-bit arguments and return value in a whole wave function. The expected
+; code treats each i64 as two 32-bit VGPRs, so v0-v3 are all saved before
+; EXEC is set to -1 and reloaded for the inactive lanes before returning.
+define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
+; DAGISEL-LABEL: ret_64:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x3
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; DAGISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: v_dual_cndmask_b32 v1, 0, v1 :: v_dual_cndmask_b32 v0, 5, v0
+; DAGISEL-NEXT: v_dual_cndmask_b32 v2, 3, v2 :: v_dual_cndmask_b32 v3, 0, v3
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; DAGISEL-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x3
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; DAGISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: ret_64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: scratch_store_b32 off, v0, s32
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; GISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 0, v1
+; GISEL-NEXT: v_dual_cndmask_b32 v2, 3, v2 :: v_dual_cndmask_b32 v3, 0, v3
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: scratch_load_b32 v0, off, s32
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; GISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: ret_64:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT: s_clause 0x3
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; DAGISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v2, 3, v2, vcc
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; DAGISEL64-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT: s_clause 0x3
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; DAGISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; DAGISEL64-NEXT: s_mov_b64 exec, vcc
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: ret_64:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT: s_clause 0x3
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; GISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; GISEL64-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GISEL64-NEXT: v_cndmask_b32_e32 v2, 3, v2, vcc
+; GISEL64-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL64-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT: s_clause 0x3
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; GISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; GISEL64-NEXT: s_mov_b64 exec, vcc
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %x = select i1 %active, i64 %a, i64 5
+ %y = select i1 %active, i64 %b, i64 3
+ %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
+ ret i64 %ret
+}
+
+; All arguments other than %active are inreg. The expected code reads them
+; from s4-s11 and copies the stored values into v0-v5, whose inactive lanes are
+; saved and restored around the body. %active is unused here and the original
+; EXEC is kept in a plain SGPR (or SGPR pair for wave64) instead of vcc.
+; NOTE(review): the <4 x i32> and float stores both target %ptr2, so the float
+; store overwrites the first dword of the vector - confirm this is intended.
+define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i32> inreg %v4i32, float inreg %float, ptr addrspace(5) inreg %ptr, ptr addrspace(5) inreg %ptr2) {
+; DAGISEL-LABEL: inreg_args:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; DAGISEL-NEXT: s_clause 0x5
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; DAGISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s9
+; DAGISEL-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; DAGISEL-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; DAGISEL-NEXT: scratch_store_b32 off, v4, s10
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_store_b128 off, v[0:3], s11
+; DAGISEL-NEXT: scratch_store_b32 off, v5, s11
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1
+; DAGISEL-NEXT: s_clause 0x5
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; DAGISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: inreg_args:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 s34, -1
+; GISEL-NEXT: s_clause 0x5
+; GISEL-NEXT: scratch_store_b32 off, v0, s32
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; GISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; GISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: s_mov_b32 s0, s5
+; GISEL-NEXT: s_mov_b32 s1, s6
+; GISEL-NEXT: s_mov_b32 s2, s7
+; GISEL-NEXT: s_mov_b32 s3, s8
+; GISEL-NEXT: v_mov_b32_e32 v4, s4
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GISEL-NEXT: v_mov_b32_e32 v5, s9
+; GISEL-NEXT: scratch_store_b32 off, v4, s10
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_store_b128 off, v[0:3], s11
+; GISEL-NEXT: scratch_store_b32 off, v5, s11
+; GISEL-NEXT: s_xor_b32 exec_lo, s34, -1
+; GISEL-NEXT: s_clause 0x5
+; GISEL-NEXT: scratch_load_b32 v0, off, s32
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; GISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; GISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; GISEL-NEXT: s_mov_b32 exec_lo, s34
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: inreg_args:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; DAGISEL64-NEXT: s_clause 0x5
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; DAGISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; DAGISEL64-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; DAGISEL64-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: v_mov_b32_e32 v4, s4
+; DAGISEL64-NEXT: v_mov_b32_e32 v0, s5
+; DAGISEL64-NEXT: v_mov_b32_e32 v1, s6
+; DAGISEL64-NEXT: v_mov_b32_e32 v2, s7
+; DAGISEL64-NEXT: v_mov_b32_e32 v3, s8
+; DAGISEL64-NEXT: v_mov_b32_e32 v5, s9
+; DAGISEL64-NEXT: scratch_store_b32 off, v4, s10
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_store_b128 off, v[0:3], s11
+; DAGISEL64-NEXT: scratch_store_b32 off, v5, s11
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1
+; DAGISEL64-NEXT: s_clause 0x5
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; DAGISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; DAGISEL64-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; DAGISEL64-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: inreg_args:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 s[34:35], -1
+; GISEL64-NEXT: s_clause 0x5
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; GISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; GISEL64-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; GISEL64-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: s_mov_b32 s0, s5
+; GISEL64-NEXT: s_mov_b32 s1, s6
+; GISEL64-NEXT: s_mov_b32 s2, s7
+; GISEL64-NEXT: s_mov_b32 s3, s8
+; GISEL64-NEXT: v_mov_b32_e32 v4, s4
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_mov_b32_e32 v0, s0
+; GISEL64-NEXT: v_mov_b32_e32 v1, s1
+; GISEL64-NEXT: v_mov_b32_e32 v2, s2
+; GISEL64-NEXT: v_mov_b32_e32 v3, s3
+; GISEL64-NEXT: v_mov_b32_e32 v5, s9
+; GISEL64-NEXT: scratch_store_b32 off, v4, s10
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_store_b128 off, v[0:3], s11
+; GISEL64-NEXT: scratch_store_b32 off, v5, s11
+; GISEL64-NEXT: s_xor_b64 exec, s[34:35], -1
+; GISEL64-NEXT: s_clause 0x5
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; GISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; GISEL64-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; GISEL64-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; GISEL64-NEXT: s_mov_b64 exec, s[34:35]
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ store i32 %i32, ptr addrspace(5) %ptr
+ store <4 x i32> %v4i32, ptr addrspace(5) %ptr2
+ store float %float, ptr addrspace(5) %ptr2
+ ret void
+}
+
+; External callee using the plain amdgpu_gfx calling convention (not a whole
+; wave function); declared for the call_gfx_from_whole_wave test.
+declare amdgpu_gfx <2 x half> @gfx_callee(<2 x half> %x, <2 x half> %y)
+
+define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 x half> %x, <2 x half> %y) {
+; DAGISEL-LABEL: call_gfx_from_whole_wave:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_mov_b32 s0, s33
+; DAGISEL-NEXT: s_mov_b32 s33, s32
+; DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4
----------------
jayfoad wrote:
Oh I think I see now. `gfx_callee` is called with all lanes active which means it can clobber all lanes of any non-preserved registers it likes. But `call_gfx_from_whole_wave` is not allowed to clobber lanes that were inactive when it was called, so it has to save those lanes around the call to `gfx_callee`.
Given the high cost of supporting this feature, I wonder if we need to allow calls from whole wave functions to non-whole-wave functions?
https://github.com/llvm/llvm-project/pull/145858
More information about the llvm-commits
mailing list