[llvm] 2f4d44b - AMDGPU: add test to show wwm register overwrite issue

Sat Feb 5 20:38:38 PST 2022

Author: Ruiling Song
Date: 2022-02-06T12:38:26+08:00
New Revision: 2f4d44bcd4a122f8f3c5539b08bdbdb20b72dc26

URL: https://github.com/llvm/llvm-project/commit/2f4d44bcd4a122f8f3c5539b08bdbdb20b72dc26
DIFF: https://github.com/llvm/llvm-project/commit/2f4d44bcd4a122f8f3c5539b08bdbdb20b72dc26.diff

LOG: AMDGPU: add test to show wwm register overwrite issue

Pre-commit the test to make the diff easy to read later.

Differential Revision: https://reviews.llvm.org/D117527

Added: 
    llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
new file mode 100644
index 0000000000000..e5d17d107076e

--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
+
+define amdgpu_cs void @if_then(<4 x i32> inreg %input, <4 x i32> inreg %output, <3 x i32> %LocalInvocationId) { ; if/then shape: %src is defined before a conditional whole-wave block and is still read after it
+; GCN-LABEL: if_then:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GCN-NEXT:  ; %bb.1: ; %.bb0
+; GCN-NEXT:    v_mov_b32_e32 v1, 1
+; GCN-NEXT:  ; %bb.2: ; %.merge
+; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 3, v0
+; GCN-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GCN-NEXT:    s_cbranch_execz .LBB0_4
+; GCN-NEXT:  ; %bb.3: ; %.then
+; GCN-NEXT:    s_not_b32 exec_lo, exec_lo
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_not_b32 exec_lo, exec_lo
+; GCN-NEXT:    s_or_saveexec_b32 s1, -1
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GCN-NEXT:    s_mov_b32 exec_lo, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    v_mov_b32_e32 v3, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    buffer_store_dword v3, v0, s[4:7], 0 offen
+; GCN-NEXT:  .LBB0_4: ; %.end
+; GCN-NEXT:    s_waitcnt_depctr 0xffe3
+; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, -1
+; GCN-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
+; GCN-NEXT:    s_endpgm
+.entry:
+  %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i32 0 ; element 0 drives both branch conditions below
+  %.not10002 = icmp eq i32 %LocalInvocationId.i0, 0 ; true: bypass %.bb0, so %src = 0
+  %i530 = icmp ult i32 %LocalInvocationId.i0, 4 ; true: bypass %.then
+  br i1 %.not10002, label %.merge, label %.bb0
+
+.bb0: ; empty block; exists only to give the %src phi its second incoming value (1)
+  br label %.merge
+
+.merge:
+  %src = phi i32 [ 0, %.entry ], [ 1, %.bb0 ] ; 0 where element 0 == 0, otherwise 1
+  br i1 %i530, label %.end, label %.then ; only lanes with element 0 >= 4 (unsigned) enter %.then
+
+.then:
+  %i562 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src, i32 0) ; in the assertions above this is the 'v_mov_b32_e32 v1, 0' between the two s_not_b32, i.e. a write into the register that holds %src
+  %i563 = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %i562, i32 273, i32 15, i32 15, i1 false) ; 273/15/15 show up in the assertions as row_shr:1 row_mask:0xf bank_mask:0xf
+  %i564 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %i563) ; result is used as the store offset on the next line
+  call void @llvm.amdgcn.raw.buffer.store.i32(i32 -1, <4 x i32> %output, i32 %i564, i32 0, i32 0) ; store -1 at offset %i564
+  br label %.end
+
+.end:
+  %idx = phi i32 [ 0, %.then ], [ %src, %.merge ] ; lanes arriving straight from %.merge still need %src (v1 in the assertions). NOTE(review): the v1 write in %.then is presumably the overwrite named in the commit title - confirm against D117527
+  call void @llvm.amdgcn.raw.buffer.store.i32(i32 -1, <4 x i32> %output, i32 %idx, i32 0, i32 0) ; store -1 at offset %idx
+  ret void
+}
+
+
+define amdgpu_cs void @if_else_vgpr_opt(<4 x i32> inreg %input, <4 x i32> inreg %output, <3 x i32> %LocalInvocationId) { ; if/else shape: %src is read in %.then, while the sibling %.else runs the whole-wave sequence on it
+; GCN-LABEL: if_else_vgpr_opt:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GCN-NEXT:  ; %bb.1: ; %.bb0
+; GCN-NEXT:    v_mov_b32_e32 v2, 1
+; GCN-NEXT:  ; %bb.2: ; %.merge
+; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 3, v0
+; GCN-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GCN-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GCN-NEXT:    s_cbranch_execz .LBB1_4
+; GCN-NEXT:  ; %bb.3: ; %.else
+; GCN-NEXT:    s_or_saveexec_b32 s1, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_mov_b32 exec_lo, s1
+; GCN-NEXT:    s_not_b32 exec_lo, exec_lo
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_not_b32 exec_lo, exec_lo
+; GCN-NEXT:    s_or_saveexec_b32 s1, -1
+; GCN-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GCN-NEXT:    s_mov_b32 exec_lo, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_mov_b32_e32 v3, -1
+; GCN-NEXT:    ; implicit-def: $vgpr2
+; GCN-NEXT:    buffer_store_dword v3, v0, s[4:7], 0 offen
+; GCN-NEXT:  .LBB1_4: ; %Flow
+; GCN-NEXT:    s_or_saveexec_b32 s0, s0
+; GCN-NEXT:    s_waitcnt_depctr 0xffe3
+; GCN-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GCN-NEXT:    s_cbranch_execz .LBB1_6
+; GCN-NEXT:  ; %bb.5: ; %.then
+; GCN-NEXT:    v_mov_b32_e32 v0, -1
+; GCN-NEXT:    buffer_store_dword v0, v2, s[4:7], 0 offen
+; GCN-NEXT:  .LBB1_6: ; %.end
+; GCN-NEXT:    s_endpgm
+.entry:
+  %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i32 0 ; element 0 drives both branch conditions below
+  %.not10002 = icmp eq i32 %LocalInvocationId.i0, 0 ; true: bypass %.bb0, so %src = 0
+  %i530 = icmp ult i32 %LocalInvocationId.i0, 4 ; true: take %.then, false: take %.else
+  br i1 %.not10002, label %.merge, label %.bb0
+
+.bb0: ; empty block; exists only to give the %src phi its second incoming value (1)
+  br label %.merge
+
+.merge:
+  %src = phi i32 [ 0, %.entry ], [ 1, %.bb0 ] ; 0 where element 0 == 0, otherwise 1
+  br i1 %i530, label %.then, label %.else ; lanes with element 0 >= 4 (unsigned) go to %.else
+
+.then:
+  call void @llvm.amdgcn.raw.buffer.store.i32(i32 -1, <4 x i32> %output, i32 %src, i32 0, i32 0) ; store -1 at offset %src; the assertions read v2 here, after the %.else code has already been emitted
+  br label %.end
+
+.else:
+  %i562 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src, i32 0) ; in the assertions above this is the 'v_mov_b32_e32 v2, 0' between the two s_not_b32, i.e. a write into the register that holds %src. NOTE(review): presumably the overwrite named in the commit title - confirm against D117527
+  %i563 = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %i562, i32 273, i32 15, i32 15, i1 false) ; 273/15/15 show up in the assertions as row_shr:1 row_mask:0xf bank_mask:0xf
+  %i564 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %i563) ; result is used as the store offset on the next line
+  call void @llvm.amdgcn.raw.buffer.store.i32(i32 -1, <4 x i32> %output, i32 %i564, i32 0, i32 0) ; store -1 at offset %i564
+  br label %.end
+
+.end: ; both arms rejoin here; nothing is live past this point
+  ret void
+}
+
+declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0 ; both tests call this as (%src, 0)
+declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0 ; both tests call this as (0, %i562, 273, 15, 15, false)
+declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1 ; applied to the update.dpp result before it is used as a store offset
+declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32 immarg, i32 immarg) #2 ; every call in this file stores -1 through %output with the last two operands set to 0
+
+attributes #0 = { convergent nounwind readnone willreturn } ; used by set.inactive and update.dpp
+attributes #1 = { convergent nounwind readnone speculatable willreturn } ; used by strict.wwm; differs from #0 only by speculatable
+attributes #2 = { nounwind willreturn writeonly } ; used by raw.buffer.store
+