[llvm] [AMDGPU][SIPreEmitPeephole] Missing condition in mustRetainExeczBranch (PR #121787)
Juan Manuel Martinez Caamaño via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 6 08:05:55 PST 2025
https://github.com/jmmartinez created https://github.com/llvm/llvm-project/pull/121787
If the code in the "then" block is modifying the exec mask, we must retain the s_cbranch_execz branch.
Consider this example:
s_cbranch_execz after
s_or_b32 exec_lo, exec_lo, -1
after:
...
If the branch is removed, then when we reach the `after` label, exec is never zero, whereas before the transformation it would have been zero.
I stumbled upon this bug by accident. I'm trying to see if the bug is more general than this (this should be a problem for any SALU operation writing to a non-SSA register) and how to test it.
From 5fda6ec7064674fea17781e93f9b8c7c52a59e58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Mon, 6 Jan 2025 16:48:55 +0100
Subject: [PATCH] [AMDGPU][SIPreEmitPeephole] Missing condition in
mustRetainExeczBranch
If the code in the "then" block is modifying the exec mask, we must
retain the s_cbranch_execz branch.
Consider this example:
s_cbranch_execz after
s_or_b32 exec_lo, exec_lo, -1
after:
...
If the branch is removed, when we reach after exec is never zero, while
before it would have been zero.
---
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 3 ++
llvm/test/CodeGen/AMDGPU/cse-convergent.ll | 3 +-
.../AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll | 35 +++++++++++++------
.../AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll | 7 ++--
.../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 3 +-
llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 6 ++--
6 files changed, 41 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 2bb70c138a50c4..8c074f72fb02e7 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -369,6 +369,9 @@ bool SIPreEmitPeephole::mustRetainExeczBranch(
if (MI.isMetaInstruction())
continue;
+ if (MI.modifiesRegister(AMDGPU::EXEC, nullptr))
+ return true;
+
if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
return true;
diff --git a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
index 7aca63d34f51bf..52f1ed7e991169 100644
--- a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
+++ b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
@@ -19,6 +19,7 @@ define i32 @test(i32 %val, i32 %cond) {
; GCN-NEXT: v_mov_b32_e32 v4, v2
; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GCN-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GCN-NEXT: s_cbranch_execz .LBB0_2
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: s_or_saveexec_b32 s5, -1
; GCN-NEXT: v_mov_b32_e32 v2, 0
@@ -26,7 +27,7 @@ define i32 @test(i32 %val, i32 %cond) {
; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
; GCN-NEXT: s_mov_b32 exec_lo, s5
; GCN-NEXT: v_mov_b32_e32 v5, v2
-; GCN-NEXT: ; %bb.2: ; %end
+; GCN-NEXT: .LBB0_2: ; %end
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GCN-NEXT: v_add_nc_u32_e32 v0, v4, v5
; GCN-NEXT: s_xor_saveexec_b32 s4, -1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
index 353f4d90cad1f2..1d1b075bdac60b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
@@ -119,6 +119,7 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
; GISEL12-NEXT: s_mov_b32 s7, s4
; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: s_and_saveexec_b32 s3, s8
+; GISEL12-NEXT: s_cbranch_execz .LBB1_2
; GISEL12-NEXT: ; %bb.1: ; %shader
; GISEL12-NEXT: s_or_saveexec_b32 s4, -1
; GISEL12-NEXT: s_wait_alu 0xfffe
@@ -129,7 +130,8 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
; GISEL12-NEXT: s_mov_b32 exec_lo, s4
; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL12-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_add_nc_u32 v10, 42, v10
-; GISEL12-NEXT: ; %bb.2: ; %tail
+; GISEL12-NEXT: .LBB1_2: ; %tail
+; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GISEL12-NEXT: s_mov_b32 exec_lo, s5
; GISEL12-NEXT: s_wait_alu 0xfffe
@@ -148,6 +150,7 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
; DAGISEL12-NEXT: s_mov_b32 s6, s3
; DAGISEL12-NEXT: s_wait_alu 0xfffe
; DAGISEL12-NEXT: s_and_saveexec_b32 s3, s8
+; DAGISEL12-NEXT: s_cbranch_execz .LBB1_2
; DAGISEL12-NEXT: ; %bb.1: ; %shader
; DAGISEL12-NEXT: s_or_saveexec_b32 s4, -1
; DAGISEL12-NEXT: s_wait_alu 0xfffe
@@ -156,7 +159,8 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0
; DAGISEL12-NEXT: s_mov_b32 exec_lo, s4
; DAGISEL12-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_add_nc_u32 v10, 42, v10
-; DAGISEL12-NEXT: ; %bb.2: ; %tail
+; DAGISEL12-NEXT: .LBB1_2: ; %tail
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5
; DAGISEL12-NEXT: s_wait_alu 0xfffe
@@ -171,6 +175,7 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
; GISEL10-NEXT: s_mov_b32 s6, s3
; GISEL10-NEXT: s_mov_b32 s7, s4
; GISEL10-NEXT: s_and_saveexec_b32 s3, s8
+; GISEL10-NEXT: s_cbranch_execz .LBB1_2
; GISEL10-NEXT: ; %bb.1: ; %shader
; GISEL10-NEXT: s_or_saveexec_b32 s4, -1
; GISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v10, s4
@@ -179,7 +184,7 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
; GISEL10-NEXT: s_mov_b32 exec_lo, s4
; GISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v10
; GISEL10-NEXT: v_mov_b32_e32 v11, v0
-; GISEL10-NEXT: ; %bb.2: ; %tail
+; GISEL10-NEXT: .LBB1_2: ; %tail
; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GISEL10-NEXT: s_mov_b32 exec_lo, s5
; GISEL10-NEXT: s_setpc_b64 s[6:7]
@@ -193,6 +198,7 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
; DAGISEL10-NEXT: s_mov_b32 s7, s4
; DAGISEL10-NEXT: s_mov_b32 s6, s3
; DAGISEL10-NEXT: s_and_saveexec_b32 s3, s8
+; DAGISEL10-NEXT: s_cbranch_execz .LBB1_2
; DAGISEL10-NEXT: ; %bb.1: ; %shader
; DAGISEL10-NEXT: s_or_saveexec_b32 s4, -1
; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v10, s4
@@ -200,7 +206,7 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
; DAGISEL10-NEXT: s_mov_b32 exec_lo, s4
; DAGISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v10
; DAGISEL10-NEXT: v_mov_b32_e32 v11, s8
-; DAGISEL10-NEXT: ; %bb.2: ; %tail
+; DAGISEL10-NEXT: .LBB1_2: ; %tail
; DAGISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s3
; DAGISEL10-NEXT: s_mov_b32 exec_lo, s5
; DAGISEL10-NEXT: s_setpc_b64 s[6:7]
@@ -240,6 +246,7 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
; GISEL12-NEXT: s_mov_b32 s7, s4
; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: s_and_saveexec_b32 s3, s8
+; GISEL12-NEXT: s_cbranch_execz .LBB2_2
; GISEL12-NEXT: ; %bb.1: ; %shader
; GISEL12-NEXT: s_or_saveexec_b32 s4, -1
; GISEL12-NEXT: s_wait_alu 0xfffe
@@ -250,7 +257,8 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
; GISEL12-NEXT: s_mov_b32 exec_lo, s4
; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL12-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_add_nc_u32 v10, 42, v12
-; GISEL12-NEXT: ; %bb.2: ; %tail
+; GISEL12-NEXT: .LBB2_2: ; %tail
+; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GISEL12-NEXT: s_mov_b32 exec_lo, s5
; GISEL12-NEXT: s_wait_alu 0xfffe
@@ -268,6 +276,7 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
; DAGISEL12-NEXT: s_mov_b32 s6, s3
; DAGISEL12-NEXT: s_wait_alu 0xfffe
; DAGISEL12-NEXT: s_and_saveexec_b32 s3, s8
+; DAGISEL12-NEXT: s_cbranch_execz .LBB2_2
; DAGISEL12-NEXT: ; %bb.1: ; %shader
; DAGISEL12-NEXT: s_or_saveexec_b32 s4, -1
; DAGISEL12-NEXT: s_wait_alu 0xfffe
@@ -276,7 +285,8 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0
; DAGISEL12-NEXT: s_mov_b32 exec_lo, s4
; DAGISEL12-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_add_nc_u32 v10, 42, v12
-; DAGISEL12-NEXT: ; %bb.2: ; %tail
+; DAGISEL12-NEXT: .LBB2_2: ; %tail
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5
; DAGISEL12-NEXT: s_wait_alu 0xfffe
@@ -289,6 +299,7 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
; GISEL10-NEXT: s_mov_b32 s6, s3
; GISEL10-NEXT: s_mov_b32 s7, s4
; GISEL10-NEXT: s_and_saveexec_b32 s3, s8
+; GISEL10-NEXT: s_cbranch_execz .LBB2_2
; GISEL10-NEXT: ; %bb.1: ; %shader
; GISEL10-NEXT: s_or_saveexec_b32 s4, -1
; GISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v12, s4
@@ -297,7 +308,7 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
; GISEL10-NEXT: s_mov_b32 exec_lo, s4
; GISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v12
; GISEL10-NEXT: v_mov_b32_e32 v11, v0
-; GISEL10-NEXT: ; %bb.2: ; %tail
+; GISEL10-NEXT: .LBB2_2: ; %tail
; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GISEL10-NEXT: s_mov_b32 exec_lo, s5
; GISEL10-NEXT: s_setpc_b64 s[6:7]
@@ -309,6 +320,7 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
; DAGISEL10-NEXT: s_mov_b32 s7, s4
; DAGISEL10-NEXT: s_mov_b32 s6, s3
; DAGISEL10-NEXT: s_and_saveexec_b32 s3, s8
+; DAGISEL10-NEXT: s_cbranch_execz .LBB2_2
; DAGISEL10-NEXT: ; %bb.1: ; %shader
; DAGISEL10-NEXT: s_or_saveexec_b32 s4, -1
; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v12, s4
@@ -316,7 +328,7 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
; DAGISEL10-NEXT: s_mov_b32 exec_lo, s4
; DAGISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v12
; DAGISEL10-NEXT: v_mov_b32_e32 v11, s8
-; DAGISEL10-NEXT: ; %bb.2: ; %tail
+; DAGISEL10-NEXT: .LBB2_2: ; %tail
; DAGISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s3
; DAGISEL10-NEXT: s_mov_b32 exec_lo, s5
; DAGISEL10-NEXT: s_setpc_b64 s[6:7]
@@ -390,6 +402,7 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call
; GISEL12-NEXT: v_cmpx_lt_i32_e64 v12, v13
; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: s_xor_b32 s3, exec_lo, s3
+; GISEL12-NEXT: s_cbranch_execz .LBB3_6
; GISEL12-NEXT: ; %bb.5: ; %tail.else
; GISEL12-NEXT: s_or_saveexec_b32 s4, -1
; GISEL12-NEXT: v_mov_b32_e32 v0, 15
@@ -397,7 +410,8 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call
; GISEL12-NEXT: s_mov_b32 exec_lo, s4
; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL12-NEXT: v_mov_b32_e32 v8, v0
-; GISEL12-NEXT: ; %bb.6: ; %Flow
+; GISEL12-NEXT: .LBB3_6: ; %Flow
+; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: s_and_not1_saveexec_b32 s3, s3
; GISEL12-NEXT: ; %bb.7: ; %tail.then
; GISEL12-NEXT: s_mov_b32 s4, 44
@@ -501,12 +515,13 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call
; GISEL10-NEXT: ; implicit-def: $vgpr8
; GISEL10-NEXT: v_cmpx_lt_i32_e64 v12, v13
; GISEL10-NEXT: s_xor_b32 s3, exec_lo, s3
+; GISEL10-NEXT: s_cbranch_execz .LBB3_6
; GISEL10-NEXT: ; %bb.5: ; %tail.else
; GISEL10-NEXT: s_or_saveexec_b32 s4, -1
; GISEL10-NEXT: v_mov_b32_e32 v0, 15
; GISEL10-NEXT: s_mov_b32 exec_lo, s4
; GISEL10-NEXT: v_mov_b32_e32 v8, v0
-; GISEL10-NEXT: ; %bb.6: ; %Flow
+; GISEL10-NEXT: .LBB3_6: ; %Flow
; GISEL10-NEXT: s_andn2_saveexec_b32 s3, s3
; GISEL10-NEXT: ; %bb.7: ; %tail.then
; GISEL10-NEXT: s_mov_b32 s4, 44
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
index 1b1c89d9f5ad2f..2fd7b07e265cf1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
@@ -57,6 +57,7 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64
; DAGISEL12-NEXT: s_mov_b32 s4, s3
; DAGISEL12-NEXT: s_wait_alu 0xfffe
; DAGISEL12-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
+; DAGISEL12-NEXT: s_cbranch_execz .LBB0_2
; DAGISEL12-NEXT: ; %bb.1: ; %shader
; DAGISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1
; DAGISEL12-NEXT: s_wait_alu 0xfffe
@@ -68,7 +69,8 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64
; DAGISEL12-NEXT: v_add_nc_u32_e32 v10, 42, v13
; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; DAGISEL12-NEXT: v_mov_b32_e32 v12, s13
-; DAGISEL12-NEXT: ; %bb.2: ; %tail
+; DAGISEL12-NEXT: .LBB0_2: ; %tail
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
; DAGISEL12-NEXT: s_or_b64 exec, exec, s[8:9]
; DAGISEL12-NEXT: s_mov_b64 exec, s[6:7]
; DAGISEL12-NEXT: s_wait_alu 0xfffe
@@ -108,6 +110,7 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64
; DAGISEL10-NEXT: s_mov_b32 s5, s4
; DAGISEL10-NEXT: s_mov_b32 s4, s3
; DAGISEL10-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
+; DAGISEL10-NEXT: s_cbranch_execz .LBB0_2
; DAGISEL10-NEXT: ; %bb.1: ; %shader
; DAGISEL10-NEXT: s_or_saveexec_b64 s[10:11], -1
; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
@@ -116,7 +119,7 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64
; DAGISEL10-NEXT: v_mov_b32_e32 v11, s12
; DAGISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v13
; DAGISEL10-NEXT: v_mov_b32_e32 v12, s13
-; DAGISEL10-NEXT: ; %bb.2: ; %tail
+; DAGISEL10-NEXT: .LBB0_2: ; %tail
; DAGISEL10-NEXT: s_or_b64 exec, exec, s[8:9]
; DAGISEL10-NEXT: s_mov_b64 exec, s[6:7]
; DAGISEL10-NEXT: s_setpc_b64 s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 1089093ea691c3..cc67a5fb2842b7 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -264,6 +264,7 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-O3-NEXT: s_and_saveexec_b64 s[34:35], vcc
+; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2
; GFX9-O3-NEXT: ; %bb.1: ; %if
; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
@@ -273,7 +274,7 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O3-NEXT: ; %bb.2: ; %merge
+; GFX9-O3-NEXT: .LBB1_2: ; %merge
; GFX9-O3-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 08cc2e4ec7d794..f8005b3f256a76 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -230,6 +230,7 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2
; GFX9-O3-NEXT: ; %bb.1: ; %if
; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
@@ -239,7 +240,7 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O3-NEXT: ; %bb.2: ; %merge
+; GFX9-O3-NEXT: .LBB1_2: ; %merge
; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1082,6 +1083,7 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-O3-NEXT: s_cbranch_execz .LBB8_2
; GFX9-O3-NEXT: ; %bb.1: ; %if
; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
@@ -1091,7 +1093,7 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O3-NEXT: ; %bb.2: ; %merge
+; GFX9-O3-NEXT: .LBB8_2: ; %merge
; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
More information about the llvm-commits
mailing list