[llvm] AMDGPU: Mark wqm_vote as SourceOfDivergence (PR #188831)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 26 12:50:11 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: vangthao95
<details>
<summary>Changes</summary>
wqm_vote returns a lane mask. Marking it as SourceOfDivergence ensures that downstream users always treat its result as a per-lane value rather than as a uniform scalar boolean.
---
Full diff: https://github.com/llvm/llvm-project/pull/188831.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td (+1)
- (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll (+7)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll (+32-44)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll (+118-31)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index d1252f4154713..bef9a27868143 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -312,6 +312,7 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
def : SourceOfDivergence<int_amdgcn_live_mask>;
+def : SourceOfDivergence<int_amdgcn_wqm_vote>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
def : SourceOfDivergence<int_amdgcn_ds_ordered_add>;
def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 46cb8cc1312dc..5fce455e114cb 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -839,5 +839,12 @@ define amdgpu_cs void @call_whole_wave(ptr addrspace(1) %out) {
declare amdgpu_gfx_whole_wave i32 @wwf(i1, i32) #0
+; CHECK: DIVERGENT: %v = call i1 @llvm.amdgcn.wqm.vote(i1 %c)
+define amdgpu_kernel void @wqm_vote(i32 %a, i32 %b) #1 {
+ %c = icmp eq i32 %a, %b
+ %v = call i1 @llvm.amdgcn.wqm.vote(i1 %c) #1
+ ret void
+}
+
attributes #0 = { nounwind convergent }
attributes #1 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index e7ade6614795c..ce270ec2f24ac 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -41,9 +41,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX7-NEXT: .LBB0_4: ; %Flow
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_wqm_b64 s[4:5], -1
-; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX7-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_vccnz .LBB0_6
+; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX7-NEXT: s_cbranch_execz .LBB0_6
; GFX7-NEXT: ; %bb.5: ; %if
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: .LBB0_6: ; %UnifiedReturnBlock
@@ -76,9 +75,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX89-NEXT: .LBB0_4: ; %Flow
; GFX89-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX89-NEXT: s_wqm_b64 s[4:5], -1
-; GFX89-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX89-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; GFX89-NEXT: s_cbranch_vccnz .LBB0_6
+; GFX89-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX89-NEXT: s_cbranch_execz .LBB0_6
; GFX89-NEXT: ; %bb.5: ; %if
; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT: .LBB0_6: ; %UnifiedReturnBlock
@@ -112,9 +110,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX1064-NEXT: .LBB0_4: ; %Flow
; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT: s_wqm_b64 s[4:5], -1
-; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; GFX1064-NEXT: s_cbranch_vccnz .LBB0_6
+; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX1064-NEXT: s_cbranch_execz .LBB0_6
; GFX1064-NEXT: ; %bb.5: ; %if
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: .LBB0_6: ; %UnifiedReturnBlock
@@ -147,9 +144,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX1032-NEXT: .LBB0_4: ; %Flow
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1032-NEXT: s_wqm_b32 s4, -1
-; GFX1032-NEXT: s_and_b32 s4, s4, s4
-; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
-; GFX1032-NEXT: s_cbranch_vccnz .LBB0_6
+; GFX1032-NEXT: s_and_saveexec_b32 s5, s4
+; GFX1032-NEXT: s_cbranch_execz .LBB0_6
; GFX1032-NEXT: ; %bb.5: ; %if
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: .LBB0_6: ; %UnifiedReturnBlock
@@ -186,10 +182,9 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX1164-NEXT: .LBB0_4: ; %Flow
; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1164-NEXT: s_wqm_b64 s[4:5], -1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX1164-NEXT: s_and_not1_b64 vcc, exec, s[4:5]
-; GFX1164-NEXT: s_cbranch_vccnz .LBB0_6
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX1164-NEXT: s_cbranch_execz .LBB0_6
; GFX1164-NEXT: ; %bb.5: ; %if
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: .LBB0_6: ; %UnifiedReturnBlock
@@ -225,10 +220,9 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
; GFX1132-NEXT: .LBB0_4: ; %Flow
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1132-NEXT: s_wqm_b32 s4, -1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_b32 s4, s4, s4
-; GFX1132-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX1132-NEXT: s_cbranch_vccnz .LBB0_6
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_saveexec_b32 s5, s4
+; GFX1132-NEXT: s_cbranch_execz .LBB0_6
; GFX1132-NEXT: ; %bb.5: ; %if
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: .LBB0_6: ; %UnifiedReturnBlock
@@ -250,14 +244,14 @@ else:
define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspace(8) inreg %inout, i32 %val) {
; GFX7-LABEL: add_i32_varying:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_wqm_b64 s[8:9], -1
; GFX7-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
-; GFX7-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_vccnz .LBB1_2
+; GFX7-NEXT: s_wqm_b64 s[4:5], -1
+; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX7-NEXT: s_cbranch_execz .LBB1_2
; GFX7-NEXT: ; %bb.1: ; %if
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX7-NEXT: .LBB1_2: ; %else
+; GFX7-NEXT: .LBB1_2: ; %UnifiedReturnBlock
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_varying:
@@ -307,9 +301,8 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX8-NEXT: .LBB1_4: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_wqm_b64 s[4:5], -1
-; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_vccnz .LBB1_6
+; GFX8-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX8-NEXT: s_cbranch_execz .LBB1_6
; GFX8-NEXT: ; %bb.5: ; %if
; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], 0
; GFX8-NEXT: .LBB1_6: ; %UnifiedReturnBlock
@@ -362,9 +355,8 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX9-NEXT: .LBB1_4: ; %Flow
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_wqm_b64 s[4:5], -1
-; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_vccnz .LBB1_6
+; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB1_6
; GFX9-NEXT: ; %bb.5: ; %if
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], 0
; GFX9-NEXT: .LBB1_6: ; %UnifiedReturnBlock
@@ -422,9 +414,8 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1064-NEXT: .LBB1_4: ; %Flow
; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT: s_wqm_b64 s[4:5], -1
-; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5]
-; GFX1064-NEXT: s_cbranch_vccnz .LBB1_6
+; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX1064-NEXT: s_cbranch_execz .LBB1_6
; GFX1064-NEXT: ; %bb.5: ; %if
; GFX1064-NEXT: buffer_store_dword v4, off, s[0:3], 0
; GFX1064-NEXT: .LBB1_6: ; %UnifiedReturnBlock
@@ -472,9 +463,8 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1032-NEXT: .LBB1_4: ; %Flow
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1032-NEXT: s_wqm_b32 s4, -1
-; GFX1032-NEXT: s_and_b32 s4, s4, s4
-; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
-; GFX1032-NEXT: s_cbranch_vccnz .LBB1_6
+; GFX1032-NEXT: s_and_saveexec_b32 s5, s4
+; GFX1032-NEXT: s_cbranch_execz .LBB1_6
; GFX1032-NEXT: ; %bb.5: ; %if
; GFX1032-NEXT: buffer_store_dword v4, off, s[0:3], 0
; GFX1032-NEXT: .LBB1_6: ; %UnifiedReturnBlock
@@ -542,10 +532,9 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1164-NEXT: .LBB1_4: ; %Flow
; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1164-NEXT: s_wqm_b64 s[4:5], -1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX1164-NEXT: s_and_not1_b64 vcc, exec, s[4:5]
-; GFX1164-NEXT: s_cbranch_vccnz .LBB1_6
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX1164-NEXT: s_cbranch_execz .LBB1_6
; GFX1164-NEXT: ; %bb.5: ; %if
; GFX1164-NEXT: buffer_store_b32 v4, off, s[0:3], 0
; GFX1164-NEXT: .LBB1_6: ; %UnifiedReturnBlock
@@ -599,10 +588,9 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1132-NEXT: .LBB1_4: ; %Flow
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1132-NEXT: s_wqm_b32 s4, -1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_b32 s4, s4, s4
-; GFX1132-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX1132-NEXT: s_cbranch_vccnz .LBB1_6
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_saveexec_b32 s5, s4
+; GFX1132-NEXT: s_cbranch_execz .LBB1_6
; GFX1132-NEXT: ; %bb.5: ; %if
; GFX1132-NEXT: buffer_store_b32 v4, off, s[0:3], 0
; GFX1132-NEXT: .LBB1_6: ; %UnifiedReturnBlock
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll
index f437cd2152f13..933a9678e2cee 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll
@@ -1,16 +1,33 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=CHECK,WAVE64 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=CHECK,WAVE64 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=CHECK,WAVE32 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=CHECK,WAVE32 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=CHECK,WAVE32 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=CHECK,WAVE32 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=WAVE64 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=WAVE64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
-;CHECK-LABEL: {{^}}ret:
-;CHECK: v_cmp_eq_u32_e32 [[CMP:[^,]+]], v0, v1
-;WAVE64: s_wqm_b64 [[WQM:[^,]+]], [[CMP]]
-;WAVE32: s_wqm_b32 [[WQM:[^,]+]], [[CMP]]
-;CHECK: v_cndmask_b32_e64 v0, 0, 1.0, [[WQM]]
define amdgpu_ps float @ret(i32 %v0, i32 %v1) #1 {
+; WAVE64-LABEL: ret:
+; WAVE64: ; %bb.0: ; %main_body
+; WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; WAVE64-NEXT: s_wqm_b64 s[0:1], vcc
+; WAVE64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
+; WAVE64-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: ret:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: s_wqm_b32 s0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: ret:
+; GFX11: ; %bb.0: ; %main_body
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-NEXT: s_wqm_b32 s0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
+; GFX11-NEXT: ; return to shader part epilog
main_body:
%c = icmp eq i32 %v0, %v1
%w = call i1 @llvm.amdgcn.wqm.vote(i1 %c)
@@ -18,20 +35,50 @@ main_body:
ret float %r
}
-;CHECK-LABEL: {{^}}true:
-;WAVE64: s_wqm_b64
-;WAVE32: s_wqm_b32
define amdgpu_ps float @true() #1 {
+; WAVE64-LABEL: true:
+; WAVE64: ; %bb.0: ; %main_body
+; WAVE64-NEXT: s_wqm_b64 s[0:1], -1
+; WAVE64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
+; WAVE64-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: true:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_wqm_b32 s0, -1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: true:
+; GFX11: ; %bb.0: ; %main_body
+; GFX11-NEXT: s_wqm_b32 s0, -1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
+; GFX11-NEXT: ; return to shader part epilog
main_body:
%w = call i1 @llvm.amdgcn.wqm.vote(i1 true)
%r = select i1 %w, float 1.0, float 0.0
ret float %r
}
-;CHECK-LABEL: {{^}}false:
-;WAVE64: s_wqm_b64
-;WAVE32: s_wqm_b32
define amdgpu_ps float @false() #1 {
+; WAVE64-LABEL: false:
+; WAVE64: ; %bb.0: ; %main_body
+; WAVE64-NEXT: s_wqm_b64 s[0:1], 0
+; WAVE64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
+; WAVE64-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: false:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_wqm_b32 s0, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: false:
+; GFX11: ; %bb.0: ; %main_body
+; GFX11-NEXT: s_wqm_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
+; GFX11-NEXT: ; return to shader part epilog
main_body:
%w = call i1 @llvm.amdgcn.wqm.vote(i1 false)
%r = select i1 %w, float 1.0, float 0.0
@@ -39,21 +86,61 @@ main_body:
}
; Note: an almost identical test for this exists in llvm.amdgcn.kill.ll
-;CHECK-LABEL: {{^}}kill:
-;CHECK: v_cmp_eq_u32_e32 [[CMP:[^,]+]], v0, v1
-
-;WAVE64: s_wqm_b64 [[WQM:[^,]+]], [[CMP]]
-;WAVE64: s_andn2_b64 [[KILL:[^,]+]], exec, [[WQM]]
-;WAVE64: s_andn2_b64 [[MASK:[^,]+]], [[EXEC:[^,]+]], [[KILL]]
-;WAVE64: s_and_b64 exec, exec, [[MASK]]
-
-;WAVE32: s_wqm_b32 [[WQM:[^,]+]], [[CMP]]
-;WAVE32: s_and{{n2|_not1}}_b32 [[KILL:[^,]+]], exec_lo, [[WQM]]
-;WAVE32: s_and{{n2|_not1}}_b32 [[MASK:[^,]+]], [[EXEC:[^,]+]], [[KILL]]
-;WAVE32: s_and_b32 exec_lo, exec_lo, [[MASK]]
-
-;CHECK: s_endpgm
define amdgpu_ps float @kill(i32 %v0, i32 %v1) #1 {
+; WAVE64-LABEL: kill:
+; WAVE64: ; %bb.0: ; %main_body
+; WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; WAVE64-NEXT: s_wqm_b64 s[2:3], vcc
+; WAVE64-NEXT: s_mov_b64 s[0:1], exec
+; WAVE64-NEXT: s_andn2_b64 s[2:3], exec, s[2:3]
+; WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; WAVE64-NEXT: s_cbranch_scc0 .LBB3_2
+; WAVE64-NEXT: ; %bb.1: ; %main_body
+; WAVE64-NEXT: s_and_b64 exec, exec, s[0:1]
+; WAVE64-NEXT: v_mov_b32_e32 v0, 0
+; WAVE64-NEXT: s_branch .LBB3_3
+; WAVE64-NEXT: .LBB3_2:
+; WAVE64-NEXT: s_mov_b64 exec, 0
+; WAVE64-NEXT: exp null, off, off, off, off done vm
+; WAVE64-NEXT: s_endpgm
+; WAVE64-NEXT: .LBB3_3:
+;
+; GFX10-LABEL: kill:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: s_mov_b32 s0, exec_lo
+; GFX10-NEXT: s_wqm_b32 s1, vcc_lo
+; GFX10-NEXT: s_andn2_b32 s1, exec_lo, s1
+; GFX10-NEXT: s_andn2_b32 s0, s0, s1
+; GFX10-NEXT: s_cbranch_scc0 .LBB3_2
+; GFX10-NEXT: ; %bb.1: ; %main_body
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_branch .LBB3_3
+; GFX10-NEXT: .LBB3_2:
+; GFX10-NEXT: s_mov_b32 exec_lo, 0
+; GFX10-NEXT: exp null, off, off, off, off done vm
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: .LBB3_3:
+;
+; GFX11-LABEL: kill:
+; GFX11: ; %bb.0: ; %main_body
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_wqm_b32 s1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s1
+; GFX11-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX11-NEXT: s_cbranch_scc0 .LBB3_2
+; GFX11-NEXT: ; %bb.1: ; %main_body
+; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_branch .LBB3_3
+; GFX11-NEXT: .LBB3_2:
+; GFX11-NEXT: s_mov_b32 exec_lo, 0
+; GFX11-NEXT: exp mrt0, off, off, off, off done
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: .LBB3_3:
main_body:
%c = icmp eq i32 %v0, %v1
%w = call i1 @llvm.amdgcn.wqm.vote(i1 %c)
``````````
</details>
https://github.com/llvm/llvm-project/pull/188831
More information about the llvm-commits
mailing list