[llvm] AMDGPU: Fix implicit vcc def to vcc_lo on wave32 targets (PR #109514)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Sun Sep 22 00:06:30 PDT 2024


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/109514

>From 83709620763a6ac4017d412d8bbfaf8ec999cca1 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 21 Sep 2024 09:41:56 +0400
Subject: [PATCH 1/2] AMDGPU: Fix implicit vcc def to vcc_lo on wave32 targets

Fixes a crash when shrinking wave32 compares with dead defs

The shrinking code was not replacing the vcc implicit-def operand
with vcc_lo on wave32. If the operand was dead, the code trying
to preserve the dead flag would crash.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  2 +-
 ...-divergent-i1-phis-no-lane-mask-merging.ll |  2 +-
 ...vergence-divergent-i1-used-outside-loop.ll |  6 +-
 .../atomic_optimizations_global_pointer.ll    | 18 ++--
 .../atomic_optimizations_local_pointer.ll     | 74 ++++++++---------
 .../AMDGPU/global_atomics_scan_fadd.ll        | 82 +++++++++----------
 .../AMDGPU/global_atomics_scan_fmax.ll        | 22 ++---
 .../AMDGPU/global_atomics_scan_fmin.ll        | 22 ++---
 .../AMDGPU/global_atomics_scan_fsub.ll        | 82 +++++++++----------
 .../CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll  |  2 +-
 .../AMDGPU/set-inactive-wwm-overwrite.ll      |  4 +-
 .../AMDGPU/should-not-hoist-set-inactive.ll   |  2 +-
 llvm/test/CodeGen/AMDGPU/shrink-true16.mir    |  2 +-
 .../shrink-v-cmp-wave32-dead-vcc-lo.mir       | 55 +++++++++++++
 llvm/test/CodeGen/AMDGPU/skip-if-dead.ll      |  2 +-
 llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll    |  8 +-
 llvm/test/CodeGen/AMDGPU/wave32.ll            |  4 +-
 17 files changed, 219 insertions(+), 170 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/shrink-v-cmp-wave32-dead-vcc-lo.mir

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 97e8b08270d615..087144687df7fe 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4512,7 +4512,7 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
   }
 
   // FIXME: Losing implicit operands
-
+  fixImplicitOperands(*Inst32);
   return Inst32;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index bb7bc0447aea04..c5ded11c7d3234 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -167,8 +167,8 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
 ; GFX10-NEXT:    s_cbranch_execz .LBB3_6
 ; GFX10-NEXT:  .LBB3_2: ; %loop_start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8
 ; GFX10-NEXT:    s_mov_b32 s7, 1
+; GFX10-NEXT:    v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8
 ; GFX10-NEXT:    s_cbranch_vccz .LBB3_4
 ; GFX10-NEXT:  ; %bb.3: ; %else
 ; GFX10-NEXT:    ; in Loop: Header=BB3_2 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index 49c232661c6dc1..b27d8fdc24ff73 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -75,12 +75,12 @@ define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr
 ; GFX10-NEXT:  .LBB1_1: ; %loop.cond
 ; GFX10-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_add_co_u32 v1, s4, v1, 4
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s4, 0, v2, s4
-; GFX10-NEXT:    v_cmp_le_i32_e32 vcc_lo, 10, v0
 ; GFX10-NEXT:    s_andn2_b32 s7, s5, exec_lo
 ; GFX10-NEXT:    s_and_b32 s8, exec_lo, s6
+; GFX10-NEXT:    v_cmp_le_i32_e32 vcc_lo, 10, v0
 ; GFX10-NEXT:    s_or_b32 s4, s7, s8
 ; GFX10-NEXT:    s_cbranch_vccz .LBB1_4
 ; GFX10-NEXT:  .LBB1_2: ; %loop.start
@@ -191,9 +191,9 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
 ; GFX10-LABEL: divergent_i1_xor_used_outside_loop_larger_loop_body:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB3_6
 ; GFX10-NEXT:  ; %bb.1: ; %loop.start.preheader
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index b17dfc7c3754a1..ce608df44dc434 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1323,9 +1323,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    s_mov_b32 s0, s2
 ; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB2_2
@@ -1451,10 +1451,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s5, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s4, s6
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB2_2
@@ -1587,9 +1586,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT:    v_writelane_b32 v3, s5, 16
 ; GFX1232_DPP-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1232_DPP-NEXT:    s_mov_b32 s4, s6
 ; GFX1232_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1232_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1232_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1232_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1232_DPP-NEXT:    s_cbranch_execz .LBB2_2
@@ -3228,8 +3227,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s8, 16
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s3, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB5_2
@@ -4991,9 +4990,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    s_mov_b32 s0, s2
 ; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB8_2
@@ -5119,10 +5118,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s5, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s4, s6
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB8_2
@@ -5255,9 +5253,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT:    v_writelane_b32 v3, s5, 16
 ; GFX1232_DPP-NEXT:    s_wait_alu 0xfffe
 ; GFX1232_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1232_DPP-NEXT:    s_mov_b32 s4, s6
 ; GFX1232_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1232_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1232_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1232_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1232_DPP-NEXT:    s_cbranch_execz .LBB8_2
@@ -6938,8 +6936,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s8, 16
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s3, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB11_2
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 988bc8eec6e517..ce90fbed813103 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -936,8 +936,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB2_2
@@ -1047,8 +1047,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB2_2
@@ -2684,8 +2684,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s6, 16
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s5, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB6_2
@@ -2874,8 +2874,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s5, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v2, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v8
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v8
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB6_2
@@ -3383,8 +3383,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX1032_DPP-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
 ; GFX1032_DPP-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v11, exec_lo, 0
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v9, v1
+; GFX1032_DPP-NEXT:    v_mbcnt_lo_u32_b32 v11, exec_lo, 0
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v11
@@ -4444,8 +4444,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB10_2
@@ -4555,8 +4555,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB10_2
@@ -6218,8 +6218,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s6, 16
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s5, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB14_2
@@ -6408,8 +6408,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s5, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v2, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v8
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v8
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB14_2
@@ -6915,8 +6915,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB15_2
@@ -7026,9 +7026,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB15_2
@@ -7627,8 +7626,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v6, s5, 16
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB16_2
@@ -7786,8 +7785,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s5, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB16_2
@@ -8294,8 +8293,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB17_2
@@ -8405,8 +8404,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB17_2
@@ -9006,8 +9005,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v6, s5, 16
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB18_2
@@ -9165,8 +9164,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s5, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB18_2
@@ -9673,8 +9672,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB19_2
@@ -9784,8 +9783,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB19_2
@@ -10385,8 +10384,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v6, s5, 16
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB20_2
@@ -10544,8 +10543,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v6, s5, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v5, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB20_2
@@ -11051,8 +11050,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB21_2
@@ -11162,9 +11161,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB21_2
@@ -12196,8 +12194,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s5, 16
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s6, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB23_2
@@ -12415,8 +12413,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v2, s5, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB23_2
@@ -12923,8 +12921,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB24_2
@@ -13034,9 +13032,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB24_2
@@ -14788,8 +14785,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB27_2
@@ -14899,8 +14896,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB27_2
@@ -15909,8 +15906,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s5, 16
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s6, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB29_2
@@ -16125,8 +16122,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v2, s5, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB29_2
@@ -16633,8 +16630,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB30_2
@@ -16744,9 +16741,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v3, s1, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB30_2
@@ -17754,8 +17750,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v2, s5, 16
 ; GFX1032_DPP-NEXT:    v_writelane_b32 v1, s6, 16
 ; GFX1032_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1032_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1032_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1032_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032_DPP-NEXT:    s_cbranch_execz .LBB32_2
@@ -17970,8 +17966,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v2, s5, 16
 ; GFX1132_DPP-NEXT:    v_writelane_b32 v1, s6, 16
 ; GFX1132_DPP-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1132_DPP-NEXT:    s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; GFX1132_DPP-NEXT:    ; implicit-def: $vgpr9_vgpr10
 ; GFX1132_DPP-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1132_DPP-NEXT:    s_cbranch_execz .LBB32_2
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 2b18f472c8c402..c3a197ce99859f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -1263,16 +1263,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ;
 ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s14, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-NEXT:    s_mov_b32 s4, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX1032-NEXT:  ; %bb.1:
@@ -1483,16 +1483,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
@@ -2471,16 +2471,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ;
 ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s14, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-NEXT:    s_mov_b32 s4, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1032-NEXT:  ; %bb.1:
@@ -2721,16 +2721,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
@@ -4503,16 +4503,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
 ;
 ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s14, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-NEXT:    s_mov_b32 s4, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1032-NEXT:  ; %bb.1:
@@ -4753,16 +4753,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
@@ -5929,19 +5929,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ;
 ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
 ; GFX1032-NEXT:    s_mov_b32 s33, s8
 ; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, s8, 0
 ; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s50, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, s8, 0
 ; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s48, s48, s9
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[0:1]
 ; GFX1032-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1032-NEXT:  ; %bb.1:
@@ -6378,19 +6378,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s33, s8
 ; GFX1032-DPP-NEXT:    s_mov_b32 s8, exec_lo
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, s8, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, s8, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
@@ -7595,8 +7595,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v8
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -8020,16 +8020,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
 ;
 ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s14, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-NEXT:    s_mov_b32 s4, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1032-NEXT:  ; %bb.1:
@@ -8277,16 +8277,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
@@ -9107,8 +9107,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -9444,16 +9444,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ;
 ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s14, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-NEXT:    s_mov_b32 s4, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB13_3
 ; GFX1032-NEXT:  ; %bb.1:
@@ -9701,16 +9701,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB13_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
@@ -10531,8 +10531,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -11437,8 +11437,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -13574,8 +13574,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v8
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index e3144ae24ae8d6..69c6adf0300c22 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -3348,17 +3348,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ;
 ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
 ; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s50, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-NEXT:    s_add_u32 s48, s48, s9
 ; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[0:1]
 ; GFX1032-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1032-NEXT:  ; %bb.1:
@@ -3778,17 +3778,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
@@ -5038,8 +5038,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v8
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -6403,8 +6403,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
 ; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -6844,17 +6844,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ;
 ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
 ; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s50, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-NEXT:    s_add_u32 s48, s48, s9
 ; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[0:1]
 ; GFX1032-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1032-NEXT:  ; %bb.1:
@@ -7274,17 +7274,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
@@ -8534,8 +8534,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v8
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index ddc103184cdf35..b7890f30f77603 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -3348,17 +3348,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ;
 ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
 ; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s50, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-NEXT:    s_add_u32 s48, s48, s9
 ; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[0:1]
 ; GFX1032-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1032-NEXT:  ; %bb.1:
@@ -3778,17 +3778,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
@@ -5038,8 +5038,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX1032-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v8
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -6403,8 +6403,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
 ; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -6844,17 +6844,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ;
 ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
 ; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s50, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-NEXT:    s_add_u32 s48, s48, s9
 ; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[0:1]
 ; GFX1032-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1032-NEXT:  ; %bb.1:
@@ -7274,17 +7274,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
@@ -8534,8 +8534,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX1032-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v8
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index f353edff1b477a..fcd5d0dc497e67 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -1367,16 +1367,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ;
 ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s14, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-NEXT:    s_mov_b32 s4, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX1032-NEXT:  ; %bb.1:
@@ -1617,16 +1617,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
@@ -2687,16 +2687,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ;
 ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s14, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-NEXT:    s_mov_b32 s4, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1032-NEXT:  ; %bb.1:
@@ -2937,16 +2937,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
@@ -4823,16 +4823,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
 ;
 ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s14, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-NEXT:    s_mov_b32 s4, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1032-NEXT:  ; %bb.1:
@@ -5073,16 +5073,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
@@ -6249,19 +6249,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ;
 ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
 ; GFX1032-NEXT:    s_mov_b32 s33, s8
 ; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, s8, 0
 ; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s50, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, s8, 0
 ; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s48, s48, s9
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX1032-NEXT:    s_mov_b64 s[38:39], s[0:1]
 ; GFX1032-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1032-NEXT:  ; %bb.1:
@@ -6698,19 +6698,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s33, s8
 ; GFX1032-DPP-NEXT:    s_mov_b32 s8, exec_lo
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, s8, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, s8, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
@@ -7915,8 +7915,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v8
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -8340,16 +8340,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
 ;
 ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s14, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-NEXT:    s_mov_b32 s4, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1032-NEXT:  ; %bb.1:
@@ -8597,16 +8597,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
@@ -9426,8 +9426,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -9763,16 +9763,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ;
 ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s14, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-NEXT:    s_mov_b32 s4, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB13_3
 ; GFX1032-NEXT:  ; %bb.1:
@@ -10020,16 +10020,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-DPP-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s15, 0x31c16000
 ; GFX1032-DPP-NEXT:    s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB13_3
 ; GFX1032-DPP-NEXT:  ; %bb.1:
@@ -10850,8 +10850,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -11756,8 +11756,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v3, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -13892,8 +13892,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v8, 0, 0
 ; GFX1032-DPP-NEXT:    v_add_f64 v[8:9], v[8:9], v[10:11]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v41, v8
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v42, v9
 ; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
index 684ca3aac7c315..004a720b9ab486 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -216,8 +216,8 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; GFX10-32-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX10-32-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX10-32-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-32-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX10-32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s0
 ; GFX10-32-NEXT:    s_xor_b32 s0, exec_lo, s2
 ; GFX10-32-NEXT:    s_cbranch_execz .LBB2_3
diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
index f60786c1bacbff..6f841c88a6d8bb 100644
--- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
@@ -4,8 +4,8 @@
 define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) inreg %output, <3 x i32> %LocalInvocationId) {
 ; GCN-LABEL: if_then:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GCN-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GCN-NEXT:  ; %bb.1: ; %.bb0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 1
@@ -60,8 +60,8 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i
 define amdgpu_cs void @if_else_vgpr_opt(ptr addrspace(8) inreg %input, ptr addrspace(8) inreg %output, <3 x i32> %LocalInvocationId) {
 ; GCN-LABEL: if_else_vgpr_opt:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GCN-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GCN-NEXT:  ; %bb.1: ; %.bb0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 1
diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
index 90b32e29e98f67..3519befabd3bc7 100644
--- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
@@ -4,10 +4,10 @@
 define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 inreg %v, i32 %lane, i32 %f, i32 %f2) #0 {
 ; GCN-LABEL: should_not_hoist_set_inactive:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 3, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s5, 0, v0
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s6, 0, v2
 ; GCN-NEXT:    s_mov_b32 s7, 0
+; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 3, v1
 ; GCN-NEXT:    s_branch .LBB0_2
 ; GCN-NEXT:  .LBB0_1: ; %bb4
 ; GCN-NEXT:    ; in Loop: Header=BB0_2 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-true16.mir b/llvm/test/CodeGen/AMDGPU/shrink-true16.mir
index 1a7ec5db9efa2a..be759049bc3a7d 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/shrink-true16.mir
@@ -11,7 +11,7 @@ body: |
     ; GFX1100-LABEL: name: 16bit_lo128_shrink
     ; GFX1100: liveins: $vgpr127
     ; GFX1100-NEXT: {{  $}}
-    ; GFX1100-NEXT: V_CMP_EQ_U16_t16_e32 0, $vgpr127, implicit-def $vcc, implicit $exec, implicit $exec
+    ; GFX1100-NEXT: V_CMP_EQ_U16_t16_e32 0, $vgpr127, implicit-def $vcc_lo, implicit $exec, implicit $exec
     $vcc_lo = V_CMP_EQ_U16_t16_e64 0, $vgpr127, implicit-def $vcc, implicit $exec
 ...
 
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-v-cmp-wave32-dead-vcc-lo.mir b/llvm/test/CodeGen/AMDGPU/shrink-v-cmp-wave32-dead-vcc-lo.mir
new file mode 100644
index 00000000000000..73c55265af20b9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shrink-v-cmp-wave32-dead-vcc-lo.mir
@@ -0,0 +1,55 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=si-shrink-instructions -mcpu=gfx1100 -o - %s | FileCheck %s
+
+# Make sure there's no crash when shrinking a v_cmp on a wave32 target
+# when the def is dead. Previously the vcc implicit def wasn't
+# properly replaced with vcc_lo, so the expected implicit operand was
+# not found in the shrunk instruction.
+
+---
+name:            shrink_v_cmp_vcc_lo_dead
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    1
+  maxCallFrameSize: 0
+  isCalleeSavedInfoValid: true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; CHECK-LABEL: name: shrink_v_cmp_vcc_lo_dead
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: V_CMP_LT_U32_e32 $vgpr0, $vgpr1, implicit-def dead $vcc_lo, implicit $exec
+    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31
+    dead renamable $vcc_lo = V_CMP_LT_U32_e64 $vgpr0, $vgpr1, implicit $exec
+    S_SETPC_B64_return undef $sgpr30_sgpr31
+
+...
+
+---
+name:            shrink_v_cmp_vcc_lo_live
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    1
+  maxCallFrameSize: 0
+  isCalleeSavedInfoValid: true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; CHECK-LABEL: name: shrink_v_cmp_vcc_lo_live
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: V_CMP_LT_U32_e32 $vgpr0, $vgpr1, implicit-def $vcc_lo, implicit $exec
+    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vcc_lo
+    renamable $vcc_lo = V_CMP_LT_U32_e64 $vgpr0, $vgpr1, implicit $exec
+    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vcc_lo
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index eebd32cd67e6e6..8e0a83671a1837 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1027,8 +1027,8 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 {
 ;
 ; GFX10-WAVE32-LABEL: test_kill_divergent_loop:
 ; GFX10-WAVE32:       ; %bb.0: ; %entry
-; GFX10-WAVE32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-WAVE32-NEXT:    s_mov_b32 s0, exec_lo
+; GFX10-WAVE32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-WAVE32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-WAVE32-NEXT:    s_xor_b32 s1, exec_lo, s1
 ; GFX10-WAVE32-NEXT:    s_cbranch_execz .LBB10_3
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
index 25d8300eb45835..a0bce3432a4bd0 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
@@ -86,8 +86,8 @@ end:
 define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
 ; SI-LABEL: else3:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 6, v0
 ; SI-NEXT:    s_mov_b32 s1, 0
+; SI-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 6, v0
 ; SI-NEXT:    s_branch .LBB2_2
 ; SI-NEXT:  .LBB2_1: ; %if.end
 ; SI-NEXT:    ; in Loop: Header=BB2_2 Depth=1
@@ -161,16 +161,16 @@ for.end:
 define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_func, ptr %extern_func2) #0 {
 ; SI-LABEL: loop:
 ; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    v_mov_b32_e32 v6, v0
 ; SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; SI-NEXT:    s_mov_b32 s14, -1
+; SI-NEXT:    v_mov_b32_e32 v6, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, v1
-; SI-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 6, v6
 ; SI-NEXT:    s_mov_b32 s15, 0x31c16000
 ; SI-NEXT:    s_add_u32 s12, s12, s1
 ; SI-NEXT:    s_addc_u32 s13, s13, 0
 ; SI-NEXT:    s_mov_b32 s32, 0
+; SI-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 6, v6
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; SI-NEXT:    s_xor_b32 s6, exec_lo, s0
@@ -243,11 +243,11 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
 ; SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; SI-NEXT:    s_mov_b32 s14, -1
 ; SI-NEXT:    v_mov_b32_e32 v40, v1
-; SI-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 6, v0
 ; SI-NEXT:    s_mov_b32 s15, 0x31c16000
 ; SI-NEXT:    s_add_u32 s12, s12, s1
 ; SI-NEXT:    s_addc_u32 s13, s13, 0
 ; SI-NEXT:    s_mov_b32 s32, 0
+; SI-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 6, v0
 ; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; SI-NEXT:    s_xor_b32 s6, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 92117e0688f65c..4576d829b0cb0a 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -372,8 +372,8 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:  .LBB10_2: ; %bb2
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    v_cmp_ge_i32_e64 s4, v1, v0
-; GFX1032-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v1, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0
+; GFX1032-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v1, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s5, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB10_4
 ; GFX1032-NEXT:  ; %bb.3: ; %bb5
@@ -515,8 +515,8 @@ bb13:
 define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #0 {
 ; GFX1032-LABEL: test_loop_with_if_else_break:
 ; GFX1032:       ; %bb.0: ; %bb
-; GFX1032-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_mov_b32 s4, 0
+; GFX1032-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB11_6
 ; GFX1032-NEXT:  ; %bb.1: ; %.preheader

>From e55232f6fce72c64eca70e90957340364e571e4f Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sun, 22 Sep 2024 11:06:07 +0400
Subject: [PATCH 2/2] Remove previous call

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 087144687df7fe..067eca209ed7d6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4502,7 +4502,6 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
         // of vcc was already added during the initial BuildMI, but we
         // 1) may need to change vcc to vcc_lo to preserve the original register
         // 2) have to preserve the original flags.
-        fixImplicitOperands(*Inst32);
         copyFlagsToImplicitVCC(*Inst32, *Src2);
         continue;
       }



More information about the llvm-commits mailing list