[llvm] [AMDGPU] Fix computed kill mask (PR #122736)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 13 08:26:26 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Piotr Sobczak (piotrAMD)
<details>
<summary>Changes</summary>
Replace S_XOR with S_ANDN2 when computing the kill mask in demote/kill lowering. This has the effect of AND'ing the demote/kill condition with exec, which is needed for a proper live mask update.
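S_XOR is inadequate because it may return true for a lane with exec=0 (whenever the condition bit is set for that inactive lane), whereas S_ANDN2 computes exec & ~condition and so can only select lanes that are currently active.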
This patch fixes an image corruption in a game.
I think the issue went unnoticed because the demote/kill condition is often naturally dependent on exec, so AND'ing with exec is usually not required.
---
Full diff: https://github.com/llvm/llvm-project/pull/122736.diff
9 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/skip-if-dead.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/wqm.ll (+2-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 9fbb847da2af1c..263f6497b9a7a8 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -960,7 +960,7 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
// so exec mask needs to be factored in.
TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
ComputeKilledMaskMI =
- BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
+ BuildMI(MBB, MI, DL, TII->get(AndN2Opc), TmpReg).addReg(Exec).add(Op);
MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
.addReg(LiveMaskReg)
.addReg(TmpReg);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
index 0f60f40bd337be..e79177c5df5ebe 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
@@ -78,7 +78,7 @@ define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) {
; SI: ; %bb.0: ; %.entry
; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1
; SI-NEXT: s_mov_b64 s[2:3], exec
-; SI-NEXT: s_xor_b64 s[0:1], s[0:1], exec
+; SI-NEXT: s_andn2_b64 s[0:1], exec, s[0:1]
; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
; SI-NEXT: s_cbranch_scc0 .LBB1_2
@@ -96,7 +96,7 @@ define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) {
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1
; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[0:1]
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
; GFX9-NEXT: s_cbranch_scc0 .LBB1_2
@@ -115,7 +115,7 @@ define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) {
; GFX10-32-NEXT: v_cmp_le_f32_e64 s0, 0, v1
; GFX10-32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0
-; GFX10-32-NEXT: s_xor_b32 s0, s0, exec_lo
+; GFX10-32-NEXT: s_andn2_b32 s0, exec_lo, s0
; GFX10-32-NEXT: s_andn2_b32 s1, s1, s0
; GFX10-32-NEXT: s_cbranch_scc0 .LBB1_2
; GFX10-32-NEXT: ; %bb.1: ; %.entry
@@ -133,7 +133,7 @@ define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) {
; GFX10-64-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1
; GFX10-64-NEXT: s_mov_b64 s[2:3], exec
; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
-; GFX10-64-NEXT: s_xor_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT: s_andn2_b64 s[0:1], exec, s[0:1]
; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
; GFX10-64-NEXT: s_cbranch_scc0 .LBB1_2
; GFX10-64-NEXT: ; %bb.1: ; %.entry
@@ -556,7 +556,7 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32
; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
-; SI-NEXT: s_xor_b64 s[14:15], vcc, exec
+; SI-NEXT: s_andn2_b64 s[14:15], exec, vcc
; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15]
; SI-NEXT: s_cbranch_scc0 .LBB5_2
; SI-NEXT: ; %bb.1: ; %.entry
@@ -580,7 +580,7 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32
; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
-; GFX9-NEXT: s_xor_b64 s[14:15], vcc, exec
+; GFX9-NEXT: s_andn2_b64 s[14:15], exec, vcc
; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15]
; GFX9-NEXT: s_cbranch_scc0 .LBB5_2
; GFX9-NEXT: ; %bb.1: ; %.entry
@@ -604,7 +604,7 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32
; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-32-NEXT: s_waitcnt vmcnt(0)
; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0
-; GFX10-32-NEXT: s_xor_b32 s13, vcc_lo, exec_lo
+; GFX10-32-NEXT: s_andn2_b32 s13, exec_lo, vcc_lo
; GFX10-32-NEXT: s_andn2_b32 s12, s12, s13
; GFX10-32-NEXT: s_cbranch_scc0 .LBB5_2
; GFX10-32-NEXT: ; %bb.1: ; %.entry
@@ -628,7 +628,7 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32
; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-64-NEXT: s_waitcnt vmcnt(0)
; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
-; GFX10-64-NEXT: s_xor_b64 s[14:15], vcc, exec
+; GFX10-64-NEXT: s_andn2_b64 s[14:15], exec, vcc
; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15]
; GFX10-64-NEXT: s_cbranch_scc0 .LBB5_2
; GFX10-64-NEXT: ; %bb.1: ; %.entry
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
index e0dacd47e51c8d..94aad397284ff0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
@@ -55,7 +55,7 @@ define amdgpu_gs void @false() {
; GCN: v_cmp_lt_i32
; GCN: v_cmp_lt_i32
; GCN: s_or_b64 s[0:1]
-; GCN: s_xor_b64 s[0:1], s[0:1], exec
+; GCN: s_and{{n2|_not1}}_b64 s[0:1], exec, s[0:1]
; GCN: s_and{{n2|_not1}}_b64 s[2:3], s[2:3], s[0:1]
; GCN: s_and_b64 exec, exec, s[2:3]
define amdgpu_gs void @and(i32 %a, i32 %b, i32 %c, i32 %d) {
@@ -238,7 +238,7 @@ define amdgpu_ps void @fcmp_x2(float %a) #0 {
; GCN: v_cmp_neq_f32_e32 vcc, 0
; GCN-DAG: s_wqm_b64 s[2:3], vcc
; GCN-DAG: s_mov_b64 s[0:1], exec
-; GCN: s_xor_b64 s[2:3], s[2:3], exec
+; GCN: s_and{{n2|_not1}}_b64 s[2:3], exec, s[2:3]
; GCN: s_and{{n2|_not1}}_b64 s[0:1], s[0:1], s[2:3]
; GCN: s_and_b64 exec, exec, s[0:1]
define amdgpu_ps float @wqm(float %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
index 004a720b9ab486..13ce979a954c24 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -78,7 +78,7 @@ define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) {
; SI: ; %bb.0: ; %.entry
; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1
; SI-NEXT: s_mov_b64 s[2:3], exec
-; SI-NEXT: s_xor_b64 s[0:1], s[0:1], exec
+; SI-NEXT: s_andn2_b64 s[0:1], exec, s[0:1]
; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
; SI-NEXT: s_cbranch_scc0 .LBB1_2
@@ -96,7 +96,7 @@ define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) {
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1
; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[0:1]
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
; GFX9-NEXT: s_cbranch_scc0 .LBB1_2
@@ -115,7 +115,7 @@ define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) {
; GFX10-32-NEXT: v_cmp_le_f32_e64 s0, 0, v1
; GFX10-32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0
-; GFX10-32-NEXT: s_xor_b32 s0, s0, exec_lo
+; GFX10-32-NEXT: s_andn2_b32 s0, exec_lo, s0
; GFX10-32-NEXT: s_andn2_b32 s1, s1, s0
; GFX10-32-NEXT: s_cbranch_scc0 .LBB1_2
; GFX10-32-NEXT: ; %bb.1: ; %.entry
@@ -133,7 +133,7 @@ define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) {
; GFX10-64-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1
; GFX10-64-NEXT: s_mov_b64 s[2:3], exec
; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
-; GFX10-64-NEXT: s_xor_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT: s_andn2_b64 s[0:1], exec, s[0:1]
; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
; GFX10-64-NEXT: s_cbranch_scc0 .LBB1_2
; GFX10-64-NEXT: ; %bb.1: ; %.entry
@@ -557,7 +557,7 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32
; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
-; SI-NEXT: s_xor_b64 s[14:15], vcc, exec
+; SI-NEXT: s_andn2_b64 s[14:15], exec, vcc
; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15]
; SI-NEXT: s_cbranch_scc0 .LBB5_2
; SI-NEXT: ; %bb.1: ; %.entry
@@ -581,7 +581,7 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32
; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
-; GFX9-NEXT: s_xor_b64 s[14:15], vcc, exec
+; GFX9-NEXT: s_andn2_b64 s[14:15], exec, vcc
; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15]
; GFX9-NEXT: s_cbranch_scc0 .LBB5_2
; GFX9-NEXT: ; %bb.1: ; %.entry
@@ -605,7 +605,7 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32
; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-32-NEXT: s_waitcnt vmcnt(0)
; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0
-; GFX10-32-NEXT: s_xor_b32 s13, vcc_lo, exec_lo
+; GFX10-32-NEXT: s_andn2_b32 s13, exec_lo, vcc_lo
; GFX10-32-NEXT: s_andn2_b32 s12, s12, s13
; GFX10-32-NEXT: s_cbranch_scc0 .LBB5_2
; GFX10-32-NEXT: ; %bb.1: ; %.entry
@@ -629,7 +629,7 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32
; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-64-NEXT: s_waitcnt vmcnt(0)
; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
-; GFX10-64-NEXT: s_xor_b64 s[14:15], vcc, exec
+; GFX10-64-NEXT: s_andn2_b64 s[14:15], exec, vcc
; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15]
; GFX10-64-NEXT: s_cbranch_scc0 .LBB5_2
; GFX10-64-NEXT: ; %bb.1: ; %.entry
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll
index a4da690b2af1f2..34c6149fe92f95 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll
@@ -43,12 +43,12 @@ main_body:
;CHECK: v_cmp_eq_u32_e32 [[CMP:[^,]+]], v0, v1
;WAVE64: s_wqm_b64 [[WQM:[^,]+]], [[CMP]]
-;WAVE64: s_xor_b64 [[KILL:[^,]+]], [[WQM]], exec
+;WAVE64: s_andn2_b64 [[KILL:[^,]+]], exec, [[WQM]]
;WAVE64: s_andn2_b64 [[MASK:[^,]+]], [[EXEC:[^,]+]], [[KILL]]
;WAVE64: s_and_b64 exec, exec, [[MASK]]
;WAVE32: s_wqm_b32 [[WQM:[^,]+]], [[CMP]]
-;WAVE32: s_xor_b32 [[KILL:[^,]+]], [[WQM]], exec
+;WAVE32: s_and{{n2|_not1}}_b32 [[KILL:[^,]+]], exec_lo, [[WQM]]
;WAVE32: s_and{{n2|_not1}}_b32 [[MASK:[^,]+]], [[EXEC:[^,]+]], [[KILL]]
;WAVE32: s_and_b32 exec_lo, exec_lo, [[MASK]]
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll
index cef959f45437db..53698ff71de620 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll
@@ -17,7 +17,7 @@ define amdgpu_ps float @uniform_kill(float %a, i32 %b, float %c) {
; SI-NEXT: ; %bb.2: ; %endif1
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_wqm_b64 s[4:5], s[2:3]
-; SI-NEXT: s_xor_b64 s[4:5], s[4:5], exec
+; SI-NEXT: s_andn2_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; SI-NEXT: s_cbranch_scc0 .LBB0_6
; SI-NEXT: ; %bb.3: ; %endif1
@@ -59,7 +59,7 @@ define amdgpu_ps float @uniform_kill(float %a, i32 %b, float %c) {
; FLAT-NEXT: ; %bb.2: ; %endif1
; FLAT-NEXT: s_or_b64 exec, exec, s[4:5]
; FLAT-NEXT: s_wqm_b64 s[4:5], s[2:3]
-; FLAT-NEXT: s_xor_b64 s[4:5], s[4:5], exec
+; FLAT-NEXT: s_andn2_b64 s[4:5], exec, s[4:5]
; FLAT-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; FLAT-NEXT: s_cbranch_scc0 .LBB0_6
; FLAT-NEXT: ; %bb.3: ; %endif1
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index fdcb6c941e16ec..715ea57d473f5b 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -761,7 +761,7 @@ define amdgpu_ps float @test_kill_control_flow_return(i32 inreg %arg) #0 {
; SI-NEXT: s_cmp_eq_u32 s0, 1
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: s_mov_b64 s[2:3], exec
-; SI-NEXT: s_xor_b64 s[4:5], s[4:5], exec
+; SI-NEXT: s_andn2_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; SI-NEXT: s_cbranch_scc0 .LBB9_4
; SI-NEXT: ; %bb.1: ; %entry
@@ -798,7 +798,7 @@ define amdgpu_ps float @test_kill_control_flow_return(i32 inreg %arg) #0 {
; GFX10-WAVE64-NEXT: s_cmp_eq_u32 s0, 1
; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec
; GFX10-WAVE64-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX10-WAVE64-NEXT: s_xor_b64 s[4:5], s[4:5], exec
+; GFX10-WAVE64-NEXT: s_andn2_b64 s[4:5], exec, s[4:5]
; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB9_4
; GFX10-WAVE64-NEXT: ; %bb.1: ; %entry
@@ -835,7 +835,7 @@ define amdgpu_ps float @test_kill_control_flow_return(i32 inreg %arg) #0 {
; GFX10-WAVE32-NEXT: s_cmp_eq_u32 s0, 1
; GFX10-WAVE32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-WAVE32-NEXT: s_cselect_b32 s2, -1, 0
-; GFX10-WAVE32-NEXT: s_xor_b32 s2, s2, exec_lo
+; GFX10-WAVE32-NEXT: s_andn2_b32 s2, exec_lo, s2
; GFX10-WAVE32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB9_4
; GFX10-WAVE32-NEXT: ; %bb.1: ; %entry
@@ -873,7 +873,7 @@ define amdgpu_ps float @test_kill_control_flow_return(i32 inreg %arg) #0 {
; GFX11-NEXT: s_mov_b64 s[2:3], exec
; GFX11-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b64 s[4:5], s[4:5], exec
+; GFX11-NEXT: s_and_not1_b64 s[4:5], exec, s[4:5]
; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
; GFX11-NEXT: s_cbranch_scc0 .LBB9_4
; GFX11-NEXT: ; %bb.1: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 9b13ce6ab69cc7..4e17be1ebb312e 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1767,7 +1767,7 @@ define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d
; GFX1032-NEXT: v_cmp_lt_i32_e64 s0, v2, v3
; GFX1032-NEXT: s_mov_b32 s1, exec_lo
; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_xor_b32 s0, s0, exec_lo
+; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s0
; GFX1032-NEXT: s_andn2_b32 s1, s1, s0
; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s1
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
@@ -1783,7 +1783,7 @@ define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d
; GFX1064-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v3
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_xor_b64 s[0:1], s[0:1], exec
+; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[0:1]
; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
; GFX1064-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
@@ -2256,7 +2256,7 @@ define amdgpu_ps void @test_wqm_vote(float %a) {
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: s_wqm_b32 s1, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s1, s1, exec_lo
+; GFX1032-NEXT: s_andn2_b32 s1, exec_lo, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: s_cbranch_scc0 .LBB44_2
; GFX1032-NEXT: ; %bb.1:
@@ -2274,7 +2274,7 @@ define amdgpu_ps void @test_wqm_vote(float %a) {
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: s_wqm_b64 s[2:3], vcc
-; GFX1064-NEXT: s_xor_b64 s[2:3], s[2:3], exec
+; GFX1064-NEXT: s_andn2_b64 s[2:3], exec, s[2:3]
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cbranch_scc0 .LBB44_2
; GFX1064-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index d18a2288ef244f..14395648de3db7 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -2938,7 +2938,7 @@ define amdgpu_ps float @test_strict_wqm_within_wqm_with_kill(<8 x i32> inreg %rs
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
-; GFX9-W64-NEXT: s_xor_b64 s[0:1], vcc, exec
+; GFX9-W64-NEXT: s_andn2_b64 s[0:1], exec, vcc
; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[0:1]
; GFX9-W64-NEXT: s_cbranch_scc0 .LBB51_2
; GFX9-W64-NEXT: ; %bb.1: ; %main_body
@@ -2973,7 +2973,7 @@ define amdgpu_ps float @test_strict_wqm_within_wqm_with_kill(<8 x i32> inreg %rs
; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
-; GFX10-W32-NEXT: s_xor_b32 s0, vcc_lo, exec_lo
+; GFX10-W32-NEXT: s_andn2_b32 s0, exec_lo, vcc_lo
; GFX10-W32-NEXT: s_andn2_b32 s12, s12, s0
; GFX10-W32-NEXT: s_cbranch_scc0 .LBB51_2
; GFX10-W32-NEXT: ; %bb.1: ; %main_body
``````````
</details>
https://github.com/llvm/llvm-project/pull/122736
More information about the llvm-commits mailing list