[llvm] [AMDGPU] Ensure that V_SET_INACTIVE inactive input is WWM computed (PR #98858)

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Sun Jul 14 22:58:33 PDT 2024


https://github.com/perlfu created https://github.com/llvm/llvm-project/pull/98858

WWM global flag must be set to ensure V_SET_INACTIVE inactive lane input is computed in WWM.
Full lowering may be skipped if global flag is not present.

>From 574004452bc1a3ce1e0be12764ffb67e4d3cb668 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Sun, 14 Jul 2024 11:08:21 +0900
Subject: [PATCH] [AMDGPU] Ensure that V_SET_INACTIVE inactive input is WWM
 computed

WWM global flag must be set to ensure V_SET_INACTIVE inactive lane
input is computed in WWM.
Full lowering may be skipped if global flag is not present.
---
 llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp    |   1 +
 .../GlobalISel/llvm.amdgcn.set.inactive.ll    | 124 ++++++-----
 .../llvm.amdgcn.set.inactive.chain.arg.ll     | 192 +++++++++++++-----
 3 files changed, 217 insertions(+), 100 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 19e761ef45b25..188c69a8733f9 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -564,6 +564,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
           }
         }
         SetInactiveInstrs.push_back(&MI);
+        GlobalFlags |= StateStrictWWM;
       } else if (TII->isDisableWQM(MI)) {
         BBI.Needs |= StateExact;
         if (!(BBI.InNeeds & StateExact)) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 0c60be9d94591..db79d27759fd6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -96,14 +96,16 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
 define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
 ; GCN-LABEL: set_inactive_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x40400000
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -117,16 +119,18 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
 ; GCN-LABEL: set_inactive_f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 0xcccccccd
-; GCN-NEXT:    s_mov_b32 s5, 0x4010cccc
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s6, 0xcccccccd
+; GCN-NEXT:    s_mov_b32 s7, 0x4010cccc
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, v1
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -140,14 +144,16 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
 define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
 ; GCN-LABEL: set_inactive_v2i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x10001
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -160,14 +166,16 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
 define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
 ; GCN-LABEL: set_inactive_v2f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x3c003c00
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -181,16 +189,18 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
 ; GCN-LABEL: set_inactive_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 1
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s6, 1
+; GCN-NEXT:    s_mov_b32 s7, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, v1
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -205,16 +215,18 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
 ; GCN-LABEL: set_inactive_v2f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 1.0
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s6, 1.0
+; GCN-NEXT:    s_mov_b32 s7, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, v1
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -228,14 +240,16 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
 define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
 ; GCN-LABEL: set_inactive_v2bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x3f803f80
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -249,16 +263,18 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
 ; GCN-LABEL: set_inactive_v4i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 0x10001
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s6, 0x10001
+; GCN-NEXT:    s_mov_b32 s7, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, v1
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -273,16 +289,18 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
 ; GCN-LABEL: set_inactive_v4f16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 0x3c003c00
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s6, 0x3c003c00
+; GCN-NEXT:    s_mov_b32 s7, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, v1
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -297,16 +315,18 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
 ; GCN-LABEL: set_inactive_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 0x3f803f80
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s6, 0x3f803f80
+; GCN-NEXT:    s_mov_b32 s7, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, v1
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index b3acd4949301e..e6acd9996b09a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -12,9 +12,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 ; GFX11-LABEL: set_inactive_chain_arg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v10
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v11
 ; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_mov_b32_e32 v0, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v0
 ; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX11-NEXT:    global_store_b32 v[8:9], v0, off
 ; GFX11-NEXT:    s_endpgm
@@ -22,9 +26,12 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 ; GFX10-LABEL: set_inactive_chain_arg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v10
+; GFX10-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v11
 ; GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_mov_b32_e32 v0, v10
+; GFX10-NEXT:    v_mov_b32_e32 v0, v0
 ; GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX10-NEXT:    global_store_dword v[8:9], v0, off
 ; GFX10-NEXT:    s_endpgm
@@ -32,9 +39,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 ; GFX11_W64-LABEL: set_inactive_chain_arg:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v10
+; GFX11_W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v11
 ; GFX11_W64-NEXT:    s_not_b64 exec, exec
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v10
+; GFX11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v0
 ; GFX11_W64-NEXT:    s_not_b64 exec, exec
 ; GFX11_W64-NEXT:    global_store_b32 v[8:9], v0, off
 ; GFX11_W64-NEXT:    s_endpgm
@@ -42,9 +53,12 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 ; GFX10_W64-LABEL: set_inactive_chain_arg:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v10
+; GFX10_W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v11
 ; GFX10_W64-NEXT:    s_not_b64 exec, exec
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v10
+; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v0
 ; GFX10_W64-NEXT:    s_not_b64 exec, exec
 ; GFX10_W64-NEXT:    global_store_dword v[8:9], v0, off
 ; GFX10_W64-NEXT:    s_endpgm
@@ -54,53 +68,135 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 }
 
 define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i64 %inactive, i64 %active) {
-; GFX11-LABEL: set_inactive_chain_arg_64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v12
-; GFX11-NEXT:    v_mov_b32_e32 v1, v13
-; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_mov_b32_e32 v0, v10
-; GFX11-NEXT:    v_mov_b32_e32 v1, v11
-; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11-NEXT:    global_store_b64 v[8:9], v[0:1], off
-; GFX11-NEXT:    s_endpgm
+; GISEL11-LABEL: set_inactive_chain_arg_64:
+; GISEL11:       ; %bb.0:
+; GISEL11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL11-NEXT:    s_or_saveexec_b32 s0, -1
+; GISEL11-NEXT:    v_dual_mov_b32 v0, v10 :: v_dual_mov_b32 v1, v11
+; GISEL11-NEXT:    s_mov_b32 exec_lo, s0
+; GISEL11-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL11-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL11-NEXT:    s_not_b32 exec_lo, exec_lo
+; GISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL11-NEXT:    v_mov_b32_e32 v0, v0
+; GISEL11-NEXT:    v_mov_b32_e32 v1, v1
+; GISEL11-NEXT:    s_not_b32 exec_lo, exec_lo
+; GISEL11-NEXT:    global_store_b64 v[8:9], v[0:1], off
+; GISEL11-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: set_inactive_chain_arg_64:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v12
-; GFX10-NEXT:    v_mov_b32_e32 v1, v13
-; GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_mov_b32_e32 v0, v10
-; GFX10-NEXT:    v_mov_b32_e32 v1, v11
-; GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10-NEXT:    global_store_dwordx2 v[8:9], v[0:1], off
-; GFX10-NEXT:    s_endpgm
+; DAGISEL11-LABEL: set_inactive_chain_arg_64:
+; DAGISEL11:       ; %bb.0:
+; DAGISEL11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL11-NEXT:    s_or_saveexec_b32 s0, -1
+; DAGISEL11-NEXT:    v_dual_mov_b32 v1, v11 :: v_dual_mov_b32 v0, v10
+; DAGISEL11-NEXT:    s_mov_b32 exec_lo, s0
+; DAGISEL11-NEXT:    v_mov_b32_e32 v0, v12
+; DAGISEL11-NEXT:    v_mov_b32_e32 v1, v13
+; DAGISEL11-NEXT:    s_not_b32 exec_lo, exec_lo
+; DAGISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; DAGISEL11-NEXT:    v_mov_b32_e32 v0, v0
+; DAGISEL11-NEXT:    v_mov_b32_e32 v1, v1
+; DAGISEL11-NEXT:    s_not_b32 exec_lo, exec_lo
+; DAGISEL11-NEXT:    global_store_b64 v[8:9], v[0:1], off
+; DAGISEL11-NEXT:    s_endpgm
 ;
-; GFX11_W64-LABEL: set_inactive_chain_arg_64:
-; GFX11_W64:       ; %bb.0:
-; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v12
-; GFX11_W64-NEXT:    v_mov_b32_e32 v1, v13
-; GFX11_W64-NEXT:    s_not_b64 exec, exec
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v10
-; GFX11_W64-NEXT:    v_mov_b32_e32 v1, v11
-; GFX11_W64-NEXT:    s_not_b64 exec, exec
-; GFX11_W64-NEXT:    global_store_b64 v[8:9], v[0:1], off
-; GFX11_W64-NEXT:    s_endpgm
+; GISEL10-LABEL: set_inactive_chain_arg_64:
+; GISEL10:       ; %bb.0:
+; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10-NEXT:    s_or_saveexec_b32 s0, -1
+; GISEL10-NEXT:    v_mov_b32_e32 v0, v10
+; GISEL10-NEXT:    v_mov_b32_e32 v1, v11
+; GISEL10-NEXT:    s_mov_b32 exec_lo, s0
+; GISEL10-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL10-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL10-NEXT:    s_not_b32 exec_lo, exec_lo
+; GISEL10-NEXT:    v_mov_b32_e32 v0, v0
+; GISEL10-NEXT:    v_mov_b32_e32 v1, v1
+; GISEL10-NEXT:    s_not_b32 exec_lo, exec_lo
+; GISEL10-NEXT:    global_store_dwordx2 v[8:9], v[0:1], off
+; GISEL10-NEXT:    s_endpgm
 ;
-; GFX10_W64-LABEL: set_inactive_chain_arg_64:
-; GFX10_W64:       ; %bb.0:
-; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v12
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, v13
-; GFX10_W64-NEXT:    s_not_b64 exec, exec
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v10
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, v11
-; GFX10_W64-NEXT:    s_not_b64 exec, exec
-; GFX10_W64-NEXT:    global_store_dwordx2 v[8:9], v[0:1], off
-; GFX10_W64-NEXT:    s_endpgm
+; DAGISEL10-LABEL: set_inactive_chain_arg_64:
+; DAGISEL10:       ; %bb.0:
+; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL10-NEXT:    s_or_saveexec_b32 s0, -1
+; DAGISEL10-NEXT:    v_mov_b32_e32 v1, v11
+; DAGISEL10-NEXT:    v_mov_b32_e32 v0, v10
+; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s0
+; DAGISEL10-NEXT:    v_mov_b32_e32 v0, v12
+; DAGISEL10-NEXT:    v_mov_b32_e32 v1, v13
+; DAGISEL10-NEXT:    s_not_b32 exec_lo, exec_lo
+; DAGISEL10-NEXT:    v_mov_b32_e32 v0, v0
+; DAGISEL10-NEXT:    v_mov_b32_e32 v1, v1
+; DAGISEL10-NEXT:    s_not_b32 exec_lo, exec_lo
+; DAGISEL10-NEXT:    global_store_dwordx2 v[8:9], v[0:1], off
+; DAGISEL10-NEXT:    s_endpgm
+;
+; GISEL11_W64-LABEL: set_inactive_chain_arg_64:
+; GISEL11_W64:       ; %bb.0:
+; GISEL11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v0, v10
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v1, v11
+; GISEL11_W64-NEXT:    s_mov_b64 exec, s[0:1]
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL11_W64-NEXT:    s_not_b64 exec, exec
+; GISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v0, v0
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v1, v1
+; GISEL11_W64-NEXT:    s_not_b64 exec, exec
+; GISEL11_W64-NEXT:    global_store_b64 v[8:9], v[0:1], off
+; GISEL11_W64-NEXT:    s_endpgm
+;
+; DAGISEL11_W64-LABEL: set_inactive_chain_arg_64:
+; DAGISEL11_W64:       ; %bb.0:
+; DAGISEL11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v1, v11
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, v10
+; DAGISEL11_W64-NEXT:    s_mov_b64 exec, s[0:1]
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v1, v13
+; DAGISEL11_W64-NEXT:    s_not_b64 exec, exec
+; DAGISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, v0
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v1, v1
+; DAGISEL11_W64-NEXT:    s_not_b64 exec, exec
+; DAGISEL11_W64-NEXT:    global_store_b64 v[8:9], v[0:1], off
+; DAGISEL11_W64-NEXT:    s_endpgm
+;
+; GISEL10_W64-LABEL: set_inactive_chain_arg_64:
+; GISEL10_W64:       ; %bb.0:
+; GISEL10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v0, v10
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v1, v11
+; GISEL10_W64-NEXT:    s_mov_b64 exec, s[0:1]
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL10_W64-NEXT:    s_not_b64 exec, exec
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v0, v0
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v1, v1
+; GISEL10_W64-NEXT:    s_not_b64 exec, exec
+; GISEL10_W64-NEXT:    global_store_dwordx2 v[8:9], v[0:1], off
+; GISEL10_W64-NEXT:    s_endpgm
+;
+; DAGISEL10_W64-LABEL: set_inactive_chain_arg_64:
+; DAGISEL10_W64:       ; %bb.0:
+; DAGISEL10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v1, v11
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v0, v10
+; DAGISEL10_W64-NEXT:    s_mov_b64 exec, s[0:1]
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v0, v12
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v1, v13
+; DAGISEL10_W64-NEXT:    s_not_b64 exec, exec
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v0, v0
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v1, v1
+; DAGISEL10_W64-NEXT:    s_not_b64 exec, exec
+; DAGISEL10_W64-NEXT:    global_store_dwordx2 v[8:9], v[0:1], off
+; DAGISEL10_W64-NEXT:    s_endpgm
   %tmp = call i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64 %active, i64 %inactive) #0
   store i64 %tmp, ptr addrspace(1) %out
   ret void



More information about the llvm-commits mailing list