[llvm] 20e9e4f - [AMDGPU] si-wqm: Skip only LiveMask COPY

Diana Picus via llvm-commits llvm-commits@lists.llvm.org
Fri Nov 10 00:33:10 PST 2023


Author: Diana Picus
Date: 2023-11-10T09:30:44+01:00
New Revision: 20e9e4f797e72bc6006c68e1eb8dbca0d2191851

URL: https://github.com/llvm/llvm-project/commit/20e9e4f797e72bc6006c68e1eb8dbca0d2191851
DIFF: https://github.com/llvm/llvm-project/commit/20e9e4f797e72bc6006c68e1eb8dbca0d2191851.diff

LOG: [AMDGPU] si-wqm: Skip only LiveMask COPY

si-wqm sometimes needs to save the LiveMask in the entry block. Later
on, while looking for a place to enter WQM/WWM, it unconditionally
skips over the first COPY instruction in the entry block. This is
incorrect for functions where the LiveMask doesn't need to be saved:
in that case the first COPY is more likely a COPY from a function
argument, which might need to execute in some non-exact mode.

This patch fixes the issue by also checking that the source of the COPY
is the EXEC register.

This produces different code in 3 of the existing tests:

In wwm-reserved.ll, an SGPR copy is now inside the WWM area rather than
outside. This is benign.

In wave32.ll, we end up with an extra register copy. This is because
the first COPY in the block is now part of the WWM block, so
si-pre-allocate-wwm-regs will allocate a new register for its
destination (when it was outside of the WWM region, the register
allocator could just re-use the same register). We might be able to
improve this in si-pre-allocate-wwm-regs but I haven't looked into it.

The same thing happens in dual-source-blend-export.ll, but for that
one it's harder to see because of the scheduling changes. I've uploaded
the before/after si-wqm output for it here:
https://reviews.llvm.org/differential/diff/553445/

Differential Revision: https://reviews.llvm.org/D158841

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
    llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
    llvm/test/CodeGen/AMDGPU/wave32.ll
    llvm/test/CodeGen/AMDGPU/wwm-reserved.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index d1d828bfbdb70da..59d6ccf513bb9e4 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -1318,7 +1318,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
   if (IsEntry) {
     // Skip the instruction that saves LiveMask
-    if (II != IE && II->getOpcode() == AMDGPU::COPY)
+    if (II != IE && II->getOpcode() == AMDGPU::COPY &&
+        II->getOperand(1).getReg() == TRI->getExec())
       ++II;
   }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll b/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll
index 8b30713e403c23b..545a894b45f7bfa 100644
--- a/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll
+++ b/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll
@@ -8,35 +8,34 @@ define amdgpu_ps void @_amdgpu_ps_main(i32 inreg %PrimMask, <2 x float> %InterpC
 ; GCN-NEXT:    s_mov_b32 s1, exec_lo
 ; GCN-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GCN-NEXT:    s_mov_b32 m0, s0
-; GCN-NEXT:    v_mbcnt_lo_u32_b32 v7, -1, 0
-; GCN-NEXT:    lds_param_load v3, attr1.x wait_vdst:15
-; GCN-NEXT:    lds_param_load v4, attr1.y wait_vdst:15
-; GCN-NEXT:    lds_param_load v5, attr1.z wait_vdst:15
-; GCN-NEXT:    lds_param_load v6, attr1.w wait_vdst:15
-; GCN-NEXT:    v_mbcnt_hi_u32_b32 v7, -1, v7
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GCN-NEXT:    v_dual_mov_b32 v2, v0 :: v_dual_and_b32 v7, 1, v7
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GCN-NEXT:    v_interp_p10_f32 v8, v4, v2, v4 wait_exp:2
-; GCN-NEXT:    v_interp_p10_f32 v10, v5, v2, v5 wait_exp:1
-; GCN-NEXT:    v_interp_p10_f32 v9, v6, v2, v6
-; GCN-NEXT:    v_interp_p10_f32 v2, v3, v2, v3 wait_exp:7
-; GCN-NEXT:    v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7
+; GCN-NEXT:    v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v3, v0
+; GCN-NEXT:    lds_param_load v4, attr1.x wait_vdst:15
+; GCN-NEXT:    lds_param_load v5, attr1.y wait_vdst:15
+; GCN-NEXT:    lds_param_load v6, attr1.z wait_vdst:15
+; GCN-NEXT:    lds_param_load v7, attr1.w wait_vdst:15
+; GCN-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GCN-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GCN-NEXT:    v_interp_p10_f32 v9, v5, v3, v5 wait_exp:2
+; GCN-NEXT:    v_interp_p10_f32 v11, v6, v3, v6 wait_exp:1
+; GCN-NEXT:    v_interp_p10_f32 v10, v7, v3, v7
+; GCN-NEXT:    v_interp_p10_f32 v3, v4, v3, v4 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v5, v5, v2, v9 wait_exp:7
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GCN-NEXT:    v_interp_p2_f32 v5, v5, v1, v10 wait_exp:7
-; GCN-NEXT:    v_interp_p2_f32 v6, v6, v1, v9 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v6, v6, v2, v11 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v7, v7, v2, v10 wait_exp:7
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GCN-NEXT:    v_interp_p2_f32 v2, v3, v1, v2 wait_exp:7
-; GCN-NEXT:    v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6]
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GCN-NEXT:    v_mov_b32_dpp v6, v6 dpp8:[1,0,3,2,5,4,7,6]
-; GCN-NEXT:    v_dual_cndmask_b32 v3, v4, v5 :: v_dual_cndmask_b32 v4, v5, v4
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v2, v6, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GCN-NEXT:    v_interp_p2_f32 v2, v4, v2, v3 wait_exp:7
+; GCN-NEXT:    v_mov_b32_dpp v5, v5 dpp8:[1,0,3,2,5,4,7,6]
+; GCN-NEXT:    v_and_b32_e32 v8, 1, v8
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GCN-NEXT:    v_mov_b32_dpp v7, v7 dpp8:[1,0,3,2,5,4,7,6]
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v8
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GCN-NEXT:    v_dual_cndmask_b32 v3, v5, v6 :: v_dual_cndmask_b32 v4, v6, v5
+; GCN-NEXT:    v_dual_cndmask_b32 v5, v2, v7 :: v_dual_cndmask_b32 v2, v7, v2
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GCN-NEXT:    v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6]
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GCN-NEXT:    v_mov_b32_dpp v5, v5 dpp8:[1,0,3,2,5,4,7,6]
 ; GCN-NEXT:    s_mov_b32 exec_lo, s1
 ; GCN-NEXT:    exp dual_src_blend0 v3, v2, off, off

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index 5a15725e1be1030..6fd6d6e2e31a1c2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -505,6 +505,311 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
   ret void
 }
 
+; When lowering function arguments, SelectionDAG will put the COPY for the last argument first.
+; This used to trigger a bug in si-wqm where the first COPY in the entry block was always skipped
+; before entering a strict mode, meaning that we'd only copy the active lanes of the last VGPR
+; argument, so we'd end up using arbitrary values for the inactive lanes.
+define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %out, i32 %active, i32 %inactive) {
+; GISEL11-LABEL: set_inactive_chain_arg_last_vgpr:
+; GISEL11:       ; %bb.0:
+; GISEL11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL11-NEXT:    s_mov_b32 s32, 0
+; GISEL11-NEXT:    v_dual_mov_b32 v41, v8 :: v_dual_mov_b32 v42, v9
+; GISEL11-NEXT:    v_mov_b32_e32 v43, v10
+; GISEL11-NEXT:    s_or_saveexec_b32 s0, -1
+; GISEL11-NEXT:    v_mov_b32_e32 v40, v11
+; GISEL11-NEXT:    s_mov_b32 exec_lo, s0
+; GISEL11-NEXT:    s_getpc_b64 s[0:1]
+; GISEL11-NEXT:    s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4
+; GISEL11-NEXT:    s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12
+; GISEL11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GISEL11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GISEL11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
+; GISEL11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0
+; GISEL11-NEXT:    v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v7, 0
+; GISEL11-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v9, 0
+; GISEL11-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v11, 0
+; GISEL11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GISEL11-NEXT:    v_mov_b32_e32 v12, v43
+; GISEL11-NEXT:    s_not_b32 exec_lo, exec_lo
+; GISEL11-NEXT:    v_mov_b32_e32 v12, v40
+; GISEL11-NEXT:    s_not_b32 exec_lo, exec_lo
+; GISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL11-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL11-NEXT:    global_store_b32 v[41:42], v0, off
+; GISEL11-NEXT:    s_nop 0
+; GISEL11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL11-NEXT:    s_endpgm
+;
+; DAGISEL11-LABEL: set_inactive_chain_arg_last_vgpr:
+; DAGISEL11:       ; %bb.0:
+; DAGISEL11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL11-NEXT:    s_mov_b32 s32, 0
+; DAGISEL11-NEXT:    s_or_saveexec_b32 s0, -1
+; DAGISEL11-NEXT:    v_mov_b32_e32 v40, v11
+; DAGISEL11-NEXT:    s_mov_b32 exec_lo, s0
+; DAGISEL11-NEXT:    s_getpc_b64 s[0:1]
+; DAGISEL11-NEXT:    s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4
+; DAGISEL11-NEXT:    s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12
+; DAGISEL11-NEXT:    v_dual_mov_b32 v43, v10 :: v_dual_mov_b32 v42, v9
+; DAGISEL11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; DAGISEL11-NEXT:    v_dual_mov_b32 v41, v8 :: v_dual_mov_b32 v0, 0
+; DAGISEL11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; DAGISEL11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v4, 0
+; DAGISEL11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v6, 0
+; DAGISEL11-NEXT:    v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v8, 0
+; DAGISEL11-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 0
+; DAGISEL11-NEXT:    v_mov_b32_e32 v11, 0
+; DAGISEL11-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL11-NEXT:    v_mov_b32_e32 v12, v43
+; DAGISEL11-NEXT:    s_not_b32 exec_lo, exec_lo
+; DAGISEL11-NEXT:    v_mov_b32_e32 v12, v40
+; DAGISEL11-NEXT:    s_not_b32 exec_lo, exec_lo
+; DAGISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL11-NEXT:    v_mov_b32_e32 v0, v12
+; DAGISEL11-NEXT:    global_store_b32 v[41:42], v0, off
+; DAGISEL11-NEXT:    s_nop 0
+; DAGISEL11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; DAGISEL11-NEXT:    s_endpgm
+;
+; GISEL10-LABEL: set_inactive_chain_arg_last_vgpr:
+; GISEL10:       ; %bb.0:
+; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10-NEXT:    s_mov_b32 s32, 0
+; GISEL10-NEXT:    v_mov_b32_e32 v41, v8
+; GISEL10-NEXT:    v_mov_b32_e32 v42, v9
+; GISEL10-NEXT:    v_mov_b32_e32 v43, v10
+; GISEL10-NEXT:    s_or_saveexec_b32 s0, -1
+; GISEL10-NEXT:    v_mov_b32_e32 v40, v11
+; GISEL10-NEXT:    s_mov_b32 exec_lo, s0
+; GISEL10-NEXT:    s_getpc_b64 s[0:1]
+; GISEL10-NEXT:    s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4
+; GISEL10-NEXT:    s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12
+; GISEL10-NEXT:    v_mov_b32_e32 v0, 0
+; GISEL10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GISEL10-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL10-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL10-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL10-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL10-NEXT:    v_mov_b32_e32 v5, 0
+; GISEL10-NEXT:    v_mov_b32_e32 v6, 0
+; GISEL10-NEXT:    v_mov_b32_e32 v7, 0
+; GISEL10-NEXT:    v_mov_b32_e32 v8, 0
+; GISEL10-NEXT:    v_mov_b32_e32 v9, 0
+; GISEL10-NEXT:    v_mov_b32_e32 v10, 0
+; GISEL10-NEXT:    v_mov_b32_e32 v11, 0
+; GISEL10-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GISEL10-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GISEL10-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GISEL10-NEXT:    v_mov_b32_e32 v12, v43
+; GISEL10-NEXT:    s_not_b32 exec_lo, exec_lo
+; GISEL10-NEXT:    v_mov_b32_e32 v12, v40
+; GISEL10-NEXT:    s_not_b32 exec_lo, exec_lo
+; GISEL10-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL10-NEXT:    global_store_dword v[41:42], v0, off
+; GISEL10-NEXT:    s_endpgm
+;
+; DAGISEL10-LABEL: set_inactive_chain_arg_last_vgpr:
+; DAGISEL10:       ; %bb.0:
+; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL10-NEXT:    s_mov_b32 s32, 0
+; DAGISEL10-NEXT:    s_or_saveexec_b32 s0, -1
+; DAGISEL10-NEXT:    v_mov_b32_e32 v40, v11
+; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s0
+; DAGISEL10-NEXT:    s_getpc_b64 s[0:1]
+; DAGISEL10-NEXT:    s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4
+; DAGISEL10-NEXT:    s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12
+; DAGISEL10-NEXT:    v_mov_b32_e32 v43, v10
+; DAGISEL10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; DAGISEL10-NEXT:    v_mov_b32_e32 v42, v9
+; DAGISEL10-NEXT:    v_mov_b32_e32 v41, v8
+; DAGISEL10-NEXT:    v_mov_b32_e32 v0, 0
+; DAGISEL10-NEXT:    v_mov_b32_e32 v1, 0
+; DAGISEL10-NEXT:    v_mov_b32_e32 v2, 0
+; DAGISEL10-NEXT:    v_mov_b32_e32 v3, 0
+; DAGISEL10-NEXT:    v_mov_b32_e32 v4, 0
+; DAGISEL10-NEXT:    v_mov_b32_e32 v5, 0
+; DAGISEL10-NEXT:    v_mov_b32_e32 v6, 0
+; DAGISEL10-NEXT:    v_mov_b32_e32 v7, 0
+; DAGISEL10-NEXT:    v_mov_b32_e32 v8, 0
+; DAGISEL10-NEXT:    v_mov_b32_e32 v9, 0
+; DAGISEL10-NEXT:    v_mov_b32_e32 v10, 0
+; DAGISEL10-NEXT:    v_mov_b32_e32 v11, 0
+; DAGISEL10-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; DAGISEL10-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; DAGISEL10-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; DAGISEL10-NEXT:    v_mov_b32_e32 v12, v43
+; DAGISEL10-NEXT:    s_not_b32 exec_lo, exec_lo
+; DAGISEL10-NEXT:    v_mov_b32_e32 v12, v40
+; DAGISEL10-NEXT:    s_not_b32 exec_lo, exec_lo
+; DAGISEL10-NEXT:    v_mov_b32_e32 v0, v12
+; DAGISEL10-NEXT:    global_store_dword v[41:42], v0, off
+; DAGISEL10-NEXT:    s_endpgm
+;
+; GISEL11_W64-LABEL: set_inactive_chain_arg_last_vgpr:
+; GISEL11_W64:       ; %bb.0:
+; GISEL11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL11_W64-NEXT:    s_mov_b32 s32, 0
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v41, v8
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v42, v9
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v43, v10
+; GISEL11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v40, v11
+; GISEL11_W64-NEXT:    s_mov_b64 exec, s[0:1]
+; GISEL11_W64-NEXT:    s_getpc_b64 s[0:1]
+; GISEL11_W64-NEXT:    s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4
+; GISEL11_W64-NEXT:    s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v0, 0
+; GISEL11_W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v5, 0
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v6, 0
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v7, 0
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v8, 0
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v9, 0
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v10, 0
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v11, 0
+; GISEL11_W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL11_W64-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v12, v43
+; GISEL11_W64-NEXT:    s_not_b64 exec, exec
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v12, v40
+; GISEL11_W64-NEXT:    s_not_b64 exec, exec
+; GISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL11_W64-NEXT:    global_store_b32 v[41:42], v0, off
+; GISEL11_W64-NEXT:    s_nop 0
+; GISEL11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL11_W64-NEXT:    s_endpgm
+;
+; DAGISEL11_W64-LABEL: set_inactive_chain_arg_last_vgpr:
+; DAGISEL11_W64:       ; %bb.0:
+; DAGISEL11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL11_W64-NEXT:    s_mov_b32 s32, 0
+; DAGISEL11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v40, v11
+; DAGISEL11_W64-NEXT:    s_mov_b64 exec, s[0:1]
+; DAGISEL11_W64-NEXT:    s_getpc_b64 s[0:1]
+; DAGISEL11_W64-NEXT:    s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4
+; DAGISEL11_W64-NEXT:    s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v43, v10
+; DAGISEL11_W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v42, v9
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v41, v8
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, 0
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v1, 0
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v2, 0
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v3, 0
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v4, 0
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v5, 0
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v6, 0
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v7, 0
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v8, 0
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v9, 0
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v10, 0
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v11, 0
+; DAGISEL11_W64-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL11_W64-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v12, v43
+; DAGISEL11_W64-NEXT:    s_not_b64 exec, exec
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v12, v40
+; DAGISEL11_W64-NEXT:    s_not_b64 exec, exec
+; DAGISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
+; DAGISEL11_W64-NEXT:    global_store_b32 v[41:42], v0, off
+; DAGISEL11_W64-NEXT:    s_nop 0
+; DAGISEL11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; DAGISEL11_W64-NEXT:    s_endpgm
+;
+; GISEL10_W64-LABEL: set_inactive_chain_arg_last_vgpr:
+; GISEL10_W64:       ; %bb.0:
+; GISEL10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10_W64-NEXT:    s_mov_b32 s32, 0
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v41, v8
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v42, v9
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v43, v10
+; GISEL10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v40, v11
+; GISEL10_W64-NEXT:    s_mov_b64 exec, s[0:1]
+; GISEL10_W64-NEXT:    s_getpc_b64 s[0:1]
+; GISEL10_W64-NEXT:    s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4
+; GISEL10_W64-NEXT:    s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v0, 0
+; GISEL10_W64-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v2, 0
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v5, 0
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v6, 0
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v7, 0
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v8, 0
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v9, 0
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v10, 0
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v11, 0
+; GISEL10_W64-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; GISEL10_W64-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GISEL10_W64-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL10_W64-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v12, v43
+; GISEL10_W64-NEXT:    s_not_b64 exec, exec
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v12, v40
+; GISEL10_W64-NEXT:    s_not_b64 exec, exec
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL10_W64-NEXT:    global_store_dword v[41:42], v0, off
+; GISEL10_W64-NEXT:    s_endpgm
+;
+; DAGISEL10_W64-LABEL: set_inactive_chain_arg_last_vgpr:
+; DAGISEL10_W64:       ; %bb.0:
+; DAGISEL10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL10_W64-NEXT:    s_mov_b32 s32, 0
+; DAGISEL10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v40, v11
+; DAGISEL10_W64-NEXT:    s_mov_b64 exec, s[0:1]
+; DAGISEL10_W64-NEXT:    s_getpc_b64 s[0:1]
+; DAGISEL10_W64-NEXT:    s_add_u32 s0, s0, gfx_callee@gotpcrel32@lo+4
+; DAGISEL10_W64-NEXT:    s_addc_u32 s1, s1, gfx_callee@gotpcrel32@hi+12
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v43, v10
+; DAGISEL10_W64-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v42, v9
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v41, v8
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v0, 0
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v1, 0
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v2, 0
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v3, 0
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v4, 0
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v5, 0
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v6, 0
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v7, 0
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v8, 0
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v9, 0
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v10, 0
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v11, 0
+; DAGISEL10_W64-NEXT:    s_mov_b64 s[0:1], s[48:49]
+; DAGISEL10_W64-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; DAGISEL10_W64-NEXT:    s_waitcnt lgkmcnt(0)
+; DAGISEL10_W64-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v12, v43
+; DAGISEL10_W64-NEXT:    s_not_b64 exec, exec
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v12, v40
+; DAGISEL10_W64-NEXT:    s_not_b64 exec, exec
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v0, v12
+; DAGISEL10_W64-NEXT:    global_store_dword v[41:42], v0, off
+; DAGISEL10_W64-NEXT:    s_endpgm
+  call amdgpu_gfx void @gfx_callee(<12 x i32> zeroinitializer)
+  %tmp = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 %active, i32 %inactive) #0
+  %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp)
+  store i32 %wwm, ptr addrspace(1) %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32, i32) #0
 declare i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64, i64) #0
 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg)

diff  --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index cadc23414dcac11..c3ac778f82e0499 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1885,8 +1885,9 @@ define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0,
 ; GFX1032-LABEL: test_wwm1:
 ; GFX1032:       ; %bb.0: ; %main_body
 ; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-NEXT:    v_add_f32_e32 v2, v2, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v3, v0
+; GFX1032-NEXT:    v_add_f32_e32 v2, v3, v2
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1032-NEXT:    ; return to shader part epilog
@@ -1894,8 +1895,9 @@ define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0,
 ; GFX1064-LABEL: test_wwm1:
 ; GFX1064:       ; %bb.0: ; %main_body
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-NEXT:    v_add_f32_e32 v2, v2, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v3, v0
+; GFX1064-NEXT:    v_add_f32_e32 v2, v3, v2
 ; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1064-NEXT:    ; return to shader part epilog
@@ -1970,8 +1972,9 @@ define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1, float
 ; GFX1032-LABEL: test_strict_wwm1:
 ; GFX1032:       ; %bb.0: ; %main_body
 ; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-NEXT:    v_add_f32_e32 v2, v2, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v3, v0
+; GFX1032-NEXT:    v_add_f32_e32 v2, v3, v2
 ; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1032-NEXT:    ; return to shader part epilog
@@ -1979,8 +1982,9 @@ define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1, float
 ; GFX1064-LABEL: test_strict_wwm1:
 ; GFX1064:       ; %bb.0: ; %main_body
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-NEXT:    v_add_f32_e32 v2, v2, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v3, v0
+; GFX1064-NEXT:    v_add_f32_e32 v2, v3, v2
 ; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX1064-NEXT:    ; return to shader part epilog

diff  --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 7a489f343d6f426..743fb3e60a94dd5 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -336,10 +336,10 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O0-NEXT:    s_add_u32 s24, s24, s11
 ; GFX9-O0-NEXT:    s_addc_u32 s25, s25, 0
 ; GFX9-O0-NEXT:    ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v7, s12, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v7, s13, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s10
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v7, s10, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v7, s11, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s13, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s12, s8
 ; GFX9-O0-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -570,10 +570,10 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O0-NEXT:    s_add_u32 s24, s24, s11
 ; GFX9-O0-NEXT:    s_addc_u32 s25, s25, 0
 ; GFX9-O0-NEXT:    ; implicit-def: $vgpr12 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v12, s12, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v12, s13, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s10
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v12, s10, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v12, s11, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s13, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s12, s8
 ; GFX9-O0-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -1204,10 +1204,10 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
 ; GFX9-O0-NEXT:    s_add_u32 s24, s24, s11
 ; GFX9-O0-NEXT:    s_addc_u32 s25, s25, 0
 ; GFX9-O0-NEXT:    ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v7, s12, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v7, s13, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s10
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v7, s10, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v7, s11, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s13, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s12, s8
 ; GFX9-O0-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -1438,10 +1438,10 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O0-NEXT:    s_add_u32 s24, s24, s11
 ; GFX9-O0-NEXT:    s_addc_u32 s25, s25, 0
 ; GFX9-O0-NEXT:    ; implicit-def: $vgpr12 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v12, s12, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v12, s13, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s10
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v12, s10, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v12, s11, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s13, s9
 ; GFX9-O0-NEXT:    s_mov_b32 s12, s8
 ; GFX9-O0-NEXT:    s_mov_b64 s[10:11], s[6:7]


        


More information about the llvm-commits mailing list