[llvm] AMDGPU: Fix implicit vcc def to vcc_lo on wave32 targets (PR #109514)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Sun Sep 22 00:06:30 PDT 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/109514
>From 83709620763a6ac4017d412d8bbfaf8ec999cca1 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 21 Sep 2024 09:41:56 +0400
Subject: [PATCH 1/2] AMDGPU: Fix implicit vcc def to vcc_lo on wave32 targets
Fixes a crash when shrinking wave32 compares with dead defs.
The shrinking code was not replacing the vcc implicit-def operand
with vcc_lo on wave32. If the operand was dead, the code trying
to preserve the dead flag would crash.
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +-
...-divergent-i1-phis-no-lane-mask-merging.ll | 2 +-
...vergence-divergent-i1-used-outside-loop.ll | 6 +-
.../atomic_optimizations_global_pointer.ll | 18 ++--
.../atomic_optimizations_local_pointer.ll | 74 ++++++++---------
.../AMDGPU/global_atomics_scan_fadd.ll | 82 +++++++++----------
.../AMDGPU/global_atomics_scan_fmax.ll | 22 ++---
.../AMDGPU/global_atomics_scan_fmin.ll | 22 ++---
.../AMDGPU/global_atomics_scan_fsub.ll | 82 +++++++++----------
.../CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll | 2 +-
.../AMDGPU/set-inactive-wwm-overwrite.ll | 4 +-
.../AMDGPU/should-not-hoist-set-inactive.ll | 2 +-
llvm/test/CodeGen/AMDGPU/shrink-true16.mir | 2 +-
.../shrink-v-cmp-wave32-dead-vcc-lo.mir | 55 +++++++++++++
llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 2 +-
llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll | 8 +-
llvm/test/CodeGen/AMDGPU/wave32.ll | 4 +-
17 files changed, 219 insertions(+), 170 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/shrink-v-cmp-wave32-dead-vcc-lo.mir
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 97e8b08270d615..087144687df7fe 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4512,7 +4512,7 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
}
// FIXME: Losing implicit operands
-
+ fixImplicitOperands(*Inst32);
return Inst32;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index bb7bc0447aea04..c5ded11c7d3234 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -167,8 +167,8 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
; GFX10-NEXT: s_cbranch_execz .LBB3_6
; GFX10-NEXT: .LBB3_2: ; %loop_start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8
; GFX10-NEXT: s_mov_b32 s7, 1
+; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8
; GFX10-NEXT: s_cbranch_vccz .LBB3_4
; GFX10-NEXT: ; %bb.3: ; %else
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index 49c232661c6dc1..b27d8fdc24ff73 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -75,12 +75,12 @@ define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr
; GFX10-NEXT: .LBB1_1: ; %loop.cond
; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT: v_add_co_u32 v1, s4, v1, 4
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s4, 0, v2, s4
-; GFX10-NEXT: v_cmp_le_i32_e32 vcc_lo, 10, v0
; GFX10-NEXT: s_andn2_b32 s7, s5, exec_lo
; GFX10-NEXT: s_and_b32 s8, exec_lo, s6
+; GFX10-NEXT: v_cmp_le_i32_e32 vcc_lo, 10, v0
; GFX10-NEXT: s_or_b32 s4, s7, s8
; GFX10-NEXT: s_cbranch_vccz .LBB1_4
; GFX10-NEXT: .LBB1_2: ; %loop.start
@@ -191,9 +191,9 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
; GFX10-LABEL: divergent_i1_xor_used_outside_loop_larger_loop_body:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, -1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB3_6
; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index b17dfc7c3754a1..ce608df44dc434 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1323,9 +1323,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s0, s2
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -1451,10 +1451,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -1587,9 +1586,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1232_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -3228,8 +3227,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s3, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB5_2
@@ -4991,9 +4990,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s0, s2
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB8_2
@@ -5119,10 +5118,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB8_2
@@ -5255,9 +5253,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1232_DPP-NEXT: s_cbranch_execz .LBB8_2
@@ -6938,8 +6936,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s3, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 988bc8eec6e517..ce90fbed813103 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -936,8 +936,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -1047,8 +1047,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -2684,8 +2684,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2
@@ -2874,8 +2874,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2
@@ -3383,8 +3383,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v11, exec_lo, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v11, exec_lo, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11
@@ -4444,8 +4444,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB10_2
@@ -4555,8 +4555,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB10_2
@@ -6218,8 +6218,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB14_2
@@ -6408,8 +6408,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB14_2
@@ -6915,8 +6915,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB15_2
@@ -7026,9 +7026,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB15_2
@@ -7627,8 +7626,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB16_2
@@ -7786,8 +7785,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB16_2
@@ -8294,8 +8293,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB17_2
@@ -8405,8 +8404,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB17_2
@@ -9006,8 +9005,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB18_2
@@ -9165,8 +9164,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB18_2
@@ -9673,8 +9672,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB19_2
@@ -9784,8 +9783,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB19_2
@@ -10385,8 +10384,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB20_2
@@ -10544,8 +10543,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB20_2
@@ -11051,8 +11050,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB21_2
@@ -11162,9 +11161,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB21_2
@@ -12196,8 +12194,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB23_2
@@ -12415,8 +12413,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB23_2
@@ -12923,8 +12921,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB24_2
@@ -13034,9 +13032,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB24_2
@@ -14788,8 +14785,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB27_2
@@ -14899,8 +14896,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB27_2
@@ -15909,8 +15906,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB29_2
@@ -16125,8 +16122,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB29_2
@@ -16633,8 +16630,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB30_2
@@ -16744,9 +16741,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB30_2
@@ -17754,8 +17750,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB32_2
@@ -17970,8 +17966,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB32_2
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 2b18f472c8c402..c3a197ce99859f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -1263,16 +1263,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
@@ -1483,16 +1483,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -2471,16 +2471,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
@@ -2721,16 +2721,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -4503,16 +4503,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-NEXT: ; %bb.1:
@@ -4753,16 +4753,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -5929,19 +5929,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s33, s8
; GFX1032-NEXT: s_mov_b32 s8, exec_lo
-; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
; GFX1032-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-NEXT: s_mov_b32 s44, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-NEXT: ; %bb.1:
@@ -6378,19 +6378,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
; GFX1032-DPP-NEXT: s_mov_b32 s8, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -7595,8 +7595,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -8020,16 +8020,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-NEXT: ; %bb.1:
@@ -8277,16 +8277,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -9107,8 +9107,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -9444,16 +9444,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
@@ -9701,16 +9701,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -10531,8 +10531,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -11437,8 +11437,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -13574,8 +13574,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index e3144ae24ae8d6..69c6adf0300c22 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -3348,17 +3348,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_add_u32 s48, s48, s9
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-NEXT: s_mov_b32 s44, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-NEXT: ; %bb.1:
@@ -3778,17 +3778,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -5038,8 +5038,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -6403,8 +6403,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -6844,17 +6844,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_add_u32 s48, s48, s9
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-NEXT: s_mov_b32 s44, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-NEXT: ; %bb.1:
@@ -7274,17 +7274,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -8534,8 +8534,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index ddc103184cdf35..b7890f30f77603 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -3348,17 +3348,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_add_u32 s48, s48, s9
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-NEXT: s_mov_b32 s44, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-NEXT: ; %bb.1:
@@ -3778,17 +3778,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -5038,8 +5038,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -6403,8 +6403,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -6844,17 +6844,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_add_u32 s48, s48, s9
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-NEXT: s_mov_b32 s44, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-NEXT: ; %bb.1:
@@ -7274,17 +7274,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -8534,8 +8534,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index f353edff1b477a..fcd5d0dc497e67 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -1367,16 +1367,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
@@ -1617,16 +1617,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -2687,16 +2687,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
@@ -2937,16 +2937,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -4823,16 +4823,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-NEXT: ; %bb.1:
@@ -5073,16 +5073,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -6249,19 +6249,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s33, s8
; GFX1032-NEXT: s_mov_b32 s8, exec_lo
-; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
; GFX1032-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-NEXT: s_mov_b32 s44, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-NEXT: ; %bb.1:
@@ -6698,19 +6698,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
; GFX1032-DPP-NEXT: s_mov_b32 s8, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -7915,8 +7915,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -8340,16 +8340,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-NEXT: ; %bb.1:
@@ -8597,16 +8597,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -9426,8 +9426,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -9763,16 +9763,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
@@ -10020,16 +10020,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -10850,8 +10850,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -11756,8 +11756,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -13892,8 +13892,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
index 684ca3aac7c315..004a720b9ab486 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -216,8 +216,8 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
; GFX10-32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: s_and_saveexec_b32 s2, s0
; GFX10-32-NEXT: s_xor_b32 s0, exec_lo, s2
; GFX10-32-NEXT: s_cbranch_execz .LBB2_3
diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
index f60786c1bacbff..6f841c88a6d8bb 100644
--- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
@@ -4,8 +4,8 @@
define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) inreg %output, <3 x i32> %LocalInvocationId) {
; GCN-LABEL: if_then:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GCN-NEXT: ; %bb.1: ; %.bb0
; GCN-NEXT: v_mov_b32_e32 v3, 1
@@ -60,8 +60,8 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i
define amdgpu_cs void @if_else_vgpr_opt(ptr addrspace(8) inreg %input, ptr addrspace(8) inreg %output, <3 x i32> %LocalInvocationId) {
; GCN-LABEL: if_else_vgpr_opt:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GCN-NEXT: ; %bb.1: ; %.bb0
; GCN-NEXT: v_mov_b32_e32 v3, 1
diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
index 90b32e29e98f67..3519befabd3bc7 100644
--- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
@@ -4,10 +4,10 @@
define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 inreg %v, i32 %lane, i32 %f, i32 %f2) #0 {
; GCN-LABEL: should_not_hoist_set_inactive:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 3, v1
; GCN-NEXT: v_cmp_eq_u32_e64 s5, 0, v0
; GCN-NEXT: v_cmp_ne_u32_e64 s6, 0, v2
; GCN-NEXT: s_mov_b32 s7, 0
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 3, v1
; GCN-NEXT: s_branch .LBB0_2
; GCN-NEXT: .LBB0_1: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-true16.mir b/llvm/test/CodeGen/AMDGPU/shrink-true16.mir
index 1a7ec5db9efa2a..be759049bc3a7d 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/shrink-true16.mir
@@ -11,7 +11,7 @@ body: |
; GFX1100-LABEL: name: 16bit_lo128_shrink
; GFX1100: liveins: $vgpr127
; GFX1100-NEXT: {{ $}}
- ; GFX1100-NEXT: V_CMP_EQ_U16_t16_e32 0, $vgpr127, implicit-def $vcc, implicit $exec, implicit $exec
+ ; GFX1100-NEXT: V_CMP_EQ_U16_t16_e32 0, $vgpr127, implicit-def $vcc_lo, implicit $exec, implicit $exec
$vcc_lo = V_CMP_EQ_U16_t16_e64 0, $vgpr127, implicit-def $vcc, implicit $exec
...
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-v-cmp-wave32-dead-vcc-lo.mir b/llvm/test/CodeGen/AMDGPU/shrink-v-cmp-wave32-dead-vcc-lo.mir
new file mode 100644
index 00000000000000..73c55265af20b9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shrink-v-cmp-wave32-dead-vcc-lo.mir
@@ -0,0 +1,55 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=si-shrink-instructions -mcpu=gfx1100 -o - %s | FileCheck %s
+
+# Make sure there's no crash when shrinking a v_cmp on a wave32 target
+# when the def is dead. Previously the vcc implicit def wasn't
+# properly replaced with vcc_lo, so the expected implicit operand was
+# not found in the shrunk instruction.
+
+---
+name: shrink_v_cmp_vcc_lo_dead
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+ maxAlignment: 1
+ maxCallFrameSize: 0
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; CHECK-LABEL: name: shrink_v_cmp_vcc_lo_dead
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: V_CMP_LT_U32_e32 $vgpr0, $vgpr1, implicit-def dead $vcc_lo, implicit $exec
+ ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31
+ dead renamable $vcc_lo = V_CMP_LT_U32_e64 $vgpr0, $vgpr1, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31
+
+...
+
+---
+name: shrink_v_cmp_vcc_lo_live
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+ maxAlignment: 1
+ maxCallFrameSize: 0
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; CHECK-LABEL: name: shrink_v_cmp_vcc_lo_live
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: V_CMP_LT_U32_e32 $vgpr0, $vgpr1, implicit-def $vcc_lo, implicit $exec
+ ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vcc_lo
+ renamable $vcc_lo = V_CMP_LT_U32_e64 $vgpr0, $vgpr1, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vcc_lo
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index eebd32cd67e6e6..8e0a83671a1837 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1027,8 +1027,8 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 {
;
; GFX10-WAVE32-LABEL: test_kill_divergent_loop:
; GFX10-WAVE32: ; %bb.0: ; %entry
-; GFX10-WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo
+; GFX10-WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB10_3
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
index 25d8300eb45835..a0bce3432a4bd0 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
@@ -86,8 +86,8 @@ end:
define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
; SI-LABEL: else3:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0
; SI-NEXT: s_mov_b32 s1, 0
+; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0
; SI-NEXT: s_branch .LBB2_2
; SI-NEXT: .LBB2_1: ; %if.end
; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
@@ -161,16 +161,16 @@ for.end:
define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_func, ptr %extern_func2) #0 {
; SI-LABEL: loop:
; SI: ; %bb.0: ; %main_body
-; SI-NEXT: v_mov_b32_e32 v6, v0
; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s14, -1
+; SI-NEXT: v_mov_b32_e32 v6, v0
; SI-NEXT: v_mov_b32_e32 v0, v1
-; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v6
; SI-NEXT: s_mov_b32 s15, 0x31c16000
; SI-NEXT: s_add_u32 s12, s12, s1
; SI-NEXT: s_addc_u32 s13, s13, 0
; SI-NEXT: s_mov_b32 s32, 0
+; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v6
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo
; SI-NEXT: s_xor_b32 s6, exec_lo, s0
@@ -243,11 +243,11 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: v_mov_b32_e32 v40, v1
-; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0
; SI-NEXT: s_mov_b32 s15, 0x31c16000
; SI-NEXT: s_add_u32 s12, s12, s1
; SI-NEXT: s_addc_u32 s13, s13, 0
; SI-NEXT: s_mov_b32 s32, 0
+; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo
; SI-NEXT: s_xor_b32 s6, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 92117e0688f65c..4576d829b0cb0a 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -372,8 +372,8 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: .LBB10_2: ; %bb2
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_cmp_ge_i32_e64 s4, v1, v0
-; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v1, v0
; GFX1032-NEXT: s_mov_b32 s3, 0
+; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v1, v0
; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_4
; GFX1032-NEXT: ; %bb.3: ; %bb5
@@ -515,8 +515,8 @@ bb13:
define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #0 {
; GFX1032-LABEL: test_loop_with_if_else_break:
; GFX1032: ; %bb.0: ; %bb
-; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_6
; GFX1032-NEXT: ; %bb.1: ; %.preheader
>From e55232f6fce72c64eca70e90957340364e571e4f Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sun, 22 Sep 2024 11:06:07 +0400
Subject: [PATCH 2/2] Remove previous call
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 087144687df7fe..067eca209ed7d6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4502,7 +4502,6 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
// of vcc was already added during the initial BuildMI, but we
// 1) may need to change vcc to vcc_lo to preserve the original register
// 2) have to preserve the original flags.
- fixImplicitOperands(*Inst32);
copyFlagsToImplicitVCC(*Inst32, *Src2);
continue;
}
More information about the llvm-commits
mailing list