[llvm] AMDGPU: Fix temporal divergence introduced by machine-sink and performance regression introduced by D155343 (PR #67456)

via llvm-commits llvm-commits@lists.llvm.org
Fri Sep 29 02:15:26 PDT 2023


https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/67456

From 080b27f8d5da2bc72fd0f5ba5c98b3260c228394 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic@amd.com>
Date: Thu, 21 Sep 2023 13:02:43 +0200
Subject: [PATCH 1/3] Revert "MachineSink: Fix sinking VGPR def out of a
 divergent loop"

This reverts commit 3f8ef57bede94445b1a1042c987cc914a886e7ff.
---
 llvm/lib/CodeGen/MachineSink.cpp                  | 15 ++++-----------
 ...-loop-var-out-of-divergent-loop-swdev407790.ll |  2 +-
 ...loop-var-out-of-divergent-loop-swdev407790.mir |  2 +-
 .../CodeGen/AMDGPU/sink-after-control-flow.mir    |  2 +-
 4 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 9d4e0c647048f53..02c7880f86f00a1 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -300,7 +300,8 @@ static bool blockPrologueInterferes(const MachineBasicBlock *BB,
       if (!Reg)
         continue;
       if (MO.isUse()) {
-        if (Reg.isPhysical() && MRI && MRI->isConstantPhysReg(Reg))
+        if (Reg.isPhysical() &&
+            (TII->isIgnorableUse(MO) || (MRI && MRI->isConstantPhysReg(Reg))))
           continue;
         if (PI->modifiesRegister(Reg, TRI))
           return true;
@@ -1247,24 +1248,16 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
   if (MBB == SuccToSinkTo)
     return nullptr;
 
-  if (!SuccToSinkTo)
-    return nullptr;
-
   // It's not safe to sink instructions to EH landing pad. Control flow into
   // landing pad is implicitly defined.
-  if (SuccToSinkTo->isEHPad())
+  if (SuccToSinkTo && SuccToSinkTo->isEHPad())
     return nullptr;
 
   // It ought to be okay to sink instructions into an INLINEASM_BR target, but
   // only if we make sure that MI occurs _before_ an INLINEASM_BR instruction in
   // the source block (which this code does not yet do). So for now, forbid
   // doing so.
-  if (SuccToSinkTo->isInlineAsmBrIndirectTarget())
-    return nullptr;
-
-  MachineBasicBlock::const_iterator InsertPos =
-      SuccToSinkTo->SkipPHIsAndLabels(SuccToSinkTo->begin());
-  if (blockPrologueInterferes(SuccToSinkTo, InsertPos, MI, TRI, TII, MRI))
+  if (SuccToSinkTo && SuccToSinkTo->isInlineAsmBrIndirectTarget())
     return nullptr;
 
   return SuccToSinkTo;
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
index e2456b74f7ef1fa..b8e74bc7db09a1a 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
@@ -21,6 +21,7 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
 ; CHECK-NEXT:  .LBB0_1: ; %Flow
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, -4, v4
 ; CHECK-NEXT:  .LBB0_2: ; %Flow1
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
@@ -53,7 +54,6 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_add_nc_u32_e32 v4, s9, v2
 ; CHECK-NEXT:    v_cmp_ge_u32_e64 s4, v4, v0
-; CHECK-NEXT:    v_add_nc_u32_e32 v4, -4, v4
 ; CHECK-NEXT:    s_or_b32 s8, s4, s8
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_1
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
index cc14b4a80d58a7d..037a285794120da 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
@@ -42,7 +42,6 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.5(0x40000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; CHECK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
   ; CHECK-NEXT:   [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[SI_IF1]], [[SI_IF]], implicit-def dead $scc
   ; CHECK-NEXT:   SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
@@ -52,6 +51,7 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.4
   ; CHECK-NEXT:   SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
   ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]]
   ; CHECK-NEXT:   S_BRANCH %bb.2
   ; CHECK-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir
index ee3d7aeb454f96b..4feef2149b42249 100644
--- a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir
+++ b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir
@@ -17,7 +17,6 @@ body:             |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 8
-  ; GFX10-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[S_MOV_B32_]], [[DEF]], implicit $exec
   ; GFX10-NEXT:   [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[DEF]], 8, 5, implicit $exec
   ; GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5
   ; GFX10-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_BFE_U32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
@@ -38,6 +37,7 @@ body:             |
   ; GFX10-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc
+  ; GFX10-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[S_MOV_B32_]], [[DEF]], implicit $exec
   ; GFX10-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 31
   ; GFX10-NEXT:   [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_BFE_U32_e64_]], killed [[S_MOV_B32_2]], implicit $exec
   ; GFX10-NEXT:   [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_1]], -1, implicit-def $scc

From f0abfeb9979134663d851980311e8f1e3c718347 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic@amd.com>
Date: Thu, 21 Sep 2023 14:20:49 +0200
Subject: [PATCH 2/3] AMDGPU: Add test for temporal divergence introduced by
 machine-sink

Introduced by 5b657f50b8e8dc5836fb80e566ca7569fd04c26f, which moved
LICM after AMDGPUCodeGenPrepare. Some instructions are no longer
sunk during IR optimizations but by machine-sinking instead.
If a VGPR instruction that uses an SGPR defined inside the cycle is
sunk out of the cycle, we end up with an unhandled case of temporal
divergence. Also add a test for the theoretical case where an SALU
instruction is sunk out of the cycle.
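
For context, the problematic pattern looks roughly like the minimal,
hypothetical LLVM IR sketch below (illustrative only, not taken from
this patch). %inc is uniform on every iteration, but lanes leave the
divergent loop on different iterations, so a VGPR computation that
consumes %inc must stay inside the loop; sunk below the loop, the
SGPR would only hold %inc from the final iteration executed by any
lane:

  define amdgpu_kernel void @temporal_divergence_sketch(ptr addrspace(1) %out) {
  entry:
    %tid = call i32 @llvm.amdgcn.workitem.id.x()
    br label %loop

  loop:
    %i = phi i32 [ 0, %entry ], [ %inc, %loop ]
    %inc = add i32 %i, 1            ; uniform inside the loop (SGPR def)
    %v = add i32 %inc, %tid         ; VGPR def; must not be sunk out of the loop
    %done = icmp uge i32 %inc, %tid ; divergent exit condition
    br i1 %done, label %end, label %loop

  end:
    store i32 %v, ptr addrspace(1) %out
    ret void
  }

  declare i32 @llvm.amdgcn.workitem.id.x()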
---
 ...ne-sink-temporal-divergence-swdev407790.ll | 1092 ++++++++++++++
 ...e-sink-temporal-divergence-swdev407790.mir | 1319 +++++++++++++++++
 2 files changed, 2411 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir

diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
new file mode 100644
index 000000000000000..ca1cf526d949a14
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -0,0 +1,1092 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s
+
+; ModuleID = 'kernel_round1_passing.bc'
+source_filename = "/tmp/comgr-295d04/input/CompileSource"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
+target triple = "amdgcn-amd-amdhsa"
+
+@kernel_round1.first_words_data = external hidden unnamed_addr addrspace(3) global [896 x i8], align 1
+@kernel_round1.collisionsData = external hidden unnamed_addr addrspace(3) global [3840 x i32], align 4
+@kernel_round1.collisionsNum = external hidden addrspace(3) global i32, align 4
+
+; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
+declare hidden i64 @_Z13get_global_idj(i32 noundef) local_unnamed_addr #0
+
+; Function Attrs: convergent nounwind
+declare hidden i32 @_Z10atomic_addPU3AS1Vjj(ptr addrspace(1) noundef, i32 noundef) local_unnamed_addr #1
+
+; Function Attrs: convergent nounwind
+declare hidden i32 @_Z10atomic_subPU3AS1Vjj(ptr addrspace(1) noundef, i32 noundef) local_unnamed_addr #1
+
+; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
+declare hidden i64 @_Z12get_local_idj(i32 noundef) local_unnamed_addr #0
+
+; Function Attrs: convergent nounwind
+declare hidden void @_Z7barrierj(i32 noundef) local_unnamed_addr #1
+
+; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
+declare hidden i32 @_Z3minjj(i32 noundef, i32 noundef) local_unnamed_addr #0
+
+; Function Attrs: convergent nounwind
+declare hidden i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef) local_unnamed_addr #1
+
+; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
+declare hidden i64 @_Z14get_local_sizej(i32 noundef) local_unnamed_addr #0
+
+; Function Attrs: convergent norecurse nounwind
+define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture noundef readonly align 1 %0, ptr addrspace(1) nocapture noundef writeonly align 1 %1, ptr addrspace(1) nocapture noundef readonly align 4 %2, ptr addrspace(1) noundef align 4 %3, ptr addrspace(1) nocapture noundef readnone align 4 %4) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 !reqd_work_group_size !10 {
+; CHECK-LABEL: kernel_round1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_load_dwordx8 s[44:51], s[6:7], 0x0
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v41, v0
+; CHECK-NEXT:    s_add_u32 s42, s34, 40
+; CHECK-NEXT:    v_mov_b32_e32 v31, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
+; CHECK-NEXT:    s_addc_u32 s43, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    s_mov_b32 s33, s14
+; CHECK-NEXT:    s_mov_b32 s40, s13
+; CHECK-NEXT:    s_mov_b32 s41, s12
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+12
+; CHECK-NEXT:    v_mov_b32_e32 v45, 0
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT:    v_mov_b32_e32 v43, v0
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, _Z12get_local_idj@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, _Z12get_local_idj@rel32@hi+12
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT:    v_mov_b32_e32 v40, v0
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    ds_write_b32 v45, v45 offset:15360
+; CHECK-NEXT:    s_getpc_b64 s[52:53]
+; CHECK-NEXT:    s_add_u32 s52, s52, _Z7barrierj@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s53, s53, _Z7barrierj@rel32@hi+12
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[52:53]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 1, v43
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 2, v43
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x7ffffffc, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, 28, v1
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    global_load_dword v0, v0, s[48:49]
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, _Z3minjj@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, _Z3minjj@rel32@hi+12
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_bfe_u32 v0, v0, v1, 4
+; CHECK-NEXT:    v_mov_b32_e32 v1, 12
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT:    v_mov_b32_e32 v42, v0
+; CHECK-NEXT:    s_mov_b32 s48, exec_lo
+; CHECK-NEXT:    v_cmpx_ne_u32_e32 0, v42
+; CHECK-NEXT:    s_cbranch_execz .LBB0_25
+; CHECK-NEXT:  ; %bb.1: ; %.preheader5
+; CHECK-NEXT:    v_mul_lo_u32 v0, v40, 14
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    s_mov_b32 s5, 0
+; CHECK-NEXT:    v_add_nc_u32_e32 v44, 0x3c04, v0
+; CHECK-NEXT:  .LBB0_2: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_add_nc_u32_e32 v1, s5, v44
+; CHECK-NEXT:    s_add_i32 s5, s5, 1
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s5, v42
+; CHECK-NEXT:    ds_write_b8 v1, v45
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_2
+; CHECK-NEXT:  ; %bb.3:
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    v_add_nc_u32_e32 v45, -1, v42
+; CHECK-NEXT:    s_mov_b32 s49, 0
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v45
+; CHECK-NEXT:    s_and_b32 exec_lo, exec_lo, vcc_lo
+; CHECK-NEXT:    s_cbranch_execz .LBB0_25
+; CHECK-NEXT:  ; %bb.4:
+; CHECK-NEXT:    v_lshlrev_b32_e32 v43, 10, v43
+; CHECK-NEXT:    v_add_nc_u32_e32 v46, 0x3c05, v0
+; CHECK-NEXT:    v_mov_b32_e32 v47, 0
+; CHECK-NEXT:    s_getpc_b64 s[42:43]
+; CHECK-NEXT:    s_add_u32 s42, s42, _Z10atomic_incPU3AS3Vj@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s43, s43, _Z10atomic_incPU3AS3Vj@rel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s55, 0
+; CHECK-NEXT:  .LBB0_5: ; =>This Loop Header: Depth=1
+; CHECK-NEXT:    ; Child Loop BB0_8 Depth 2
+; CHECK-NEXT:    ; Child Loop BB0_20 Depth 2
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, s55, v44
+; CHECK-NEXT:    s_lshl_b32 s4, s55, 5
+; CHECK-NEXT:    s_add_i32 s54, s55, 1
+; CHECK-NEXT:    s_add_i32 s5, s55, 5
+; CHECK-NEXT:    v_or3_b32 v57, s4, v43, s54
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    ds_read_u8 v56, v0
+; CHECK-NEXT:    v_mov_b32_e32 v59, s54
+; CHECK-NEXT:    s_mov_b32 s56, exec_lo
+; CHECK-NEXT:    v_cmpx_lt_u32_e64 s5, v42
+; CHECK-NEXT:    s_cbranch_execz .LBB0_17
+; CHECK-NEXT:  ; %bb.6: ; %.preheader2
+; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v58, 0xff, v56
+; CHECK-NEXT:    s_mov_b32 s57, 0
+; CHECK-NEXT:    s_mov_b32 s58, 0
+; CHECK-NEXT:    s_branch .LBB0_8
+; CHECK-NEXT:  .LBB0_7: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s59
+; CHECK-NEXT:    s_add_i32 s58, s58, 4
+; CHECK-NEXT:    s_add_i32 s4, s55, s58
+; CHECK-NEXT:    s_add_i32 s5, s4, 5
+; CHECK-NEXT:    s_add_i32 s4, s4, 1
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s5, v42
+; CHECK-NEXT:    v_mov_b32_e32 v59, s4
+; CHECK-NEXT:    s_or_b32 s57, vcc_lo, s57
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s57
+; CHECK-NEXT:    s_cbranch_execz .LBB0_16
+; CHECK-NEXT:  .LBB0_8: ; Parent Loop BB0_5 Depth=1
+; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    v_add_nc_u32_e32 v60, s58, v46
+; CHECK-NEXT:    v_add_nc_u32_e32 v59, s58, v57
+; CHECK-NEXT:    s_mov_b32 s59, exec_lo
+; CHECK-NEXT:    ds_read_u8 v0, v60
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_cmpx_eq_u16_e64 v58, v0
+; CHECK-NEXT:    s_cbranch_execz .LBB0_10
+; CHECK-NEXT:  ; %bb.9: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:    ds_write_b32 v0, v59
+; CHECK-NEXT:  .LBB0_10: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s59
+; CHECK-NEXT:    ds_read_u8 v0, v60 offset:1
+; CHECK-NEXT:    s_mov_b32 s59, exec_lo
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_cmpx_eq_u16_e64 v58, v0
+; CHECK-NEXT:    s_cbranch_execz .LBB0_12
+; CHECK-NEXT:  ; %bb.11: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    v_add_nc_u32_e32 v61, 1, v59
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:    ds_write_b32 v0, v61
+; CHECK-NEXT:  .LBB0_12: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s59
+; CHECK-NEXT:    ds_read_u8 v0, v60 offset:2
+; CHECK-NEXT:    s_mov_b32 s59, exec_lo
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_cmpx_eq_u16_e64 v58, v0
+; CHECK-NEXT:    s_cbranch_execz .LBB0_14
+; CHECK-NEXT:  ; %bb.13: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    v_add_nc_u32_e32 v61, 2, v59
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:    ds_write_b32 v0, v61
+; CHECK-NEXT:  .LBB0_14: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s59
+; CHECK-NEXT:    ds_read_u8 v0, v60 offset:3
+; CHECK-NEXT:    s_mov_b32 s59, exec_lo
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_cmpx_eq_u16_e64 v58, v0
+; CHECK-NEXT:    s_cbranch_execz .LBB0_7
+; CHECK-NEXT:  ; %bb.15: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    v_add_nc_u32_e32 v59, 3, v59
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:    ds_write_b32 v0, v59
+; CHECK-NEXT:    s_branch .LBB0_7
+; CHECK-NEXT:  .LBB0_16: ; %Flow43
+; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
+; CHECK-NEXT:    v_add_nc_u32_e32 v57, s58, v57
+; CHECK-NEXT:  .LBB0_17: ; %Flow44
+; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s56
+; CHECK-NEXT:    s_mov_b32 s55, exec_lo
+; CHECK-NEXT:    v_cmpx_lt_u32_e64 v59, v42
+; CHECK-NEXT:    s_cbranch_execz .LBB0_23
+; CHECK-NEXT:  ; %bb.18: ; %.preheader
+; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    s_mov_b32 s56, 0
+; CHECK-NEXT:    s_inst_prefetch 0x1
+; CHECK-NEXT:    s_branch .LBB0_20
+; CHECK-NEXT:    .p2align 6
+; CHECK-NEXT:  .LBB0_19: ; in Loop: Header=BB0_20 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
+; CHECK-NEXT:    v_add_nc_u32_e32 v59, 1, v59
+; CHECK-NEXT:    v_add_nc_u32_e32 v57, 1, v57
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v59, v42
+; CHECK-NEXT:    s_or_b32 s56, vcc_lo, s56
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s56
+; CHECK-NEXT:    s_cbranch_execz .LBB0_22
+; CHECK-NEXT:  .LBB0_20: ; Parent Loop BB0_5 Depth=1
+; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, v44, v59
+; CHECK-NEXT:    ds_read_u8 v0, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NEXT:    s_and_saveexec_b32 s57, s4
+; CHECK-NEXT:    s_cbranch_execz .LBB0_19
+; CHECK-NEXT:  ; %bb.21: ; in Loop: Header=BB0_20 Depth=2
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:    ds_write_b32 v0, v57
+; CHECK-NEXT:    s_branch .LBB0_19
+; CHECK-NEXT:  .LBB0_22: ; %Flow41
+; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    s_inst_prefetch 0x2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s56
+; CHECK-NEXT:  .LBB0_23: ; %Flow42
+; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s55
+; CHECK-NEXT:  ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s54, v45
+; CHECK-NEXT:    v_cmp_lt_u32_e64 s4, 59, v47
+; CHECK-NEXT:    v_add_nc_u32_e32 v46, 1, v46
+; CHECK-NEXT:    s_mov_b32 s55, s54
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_and_b32 s4, exec_lo, s4
+; CHECK-NEXT:    s_or_b32 s49, s4, s49
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s49
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_5
+; CHECK-NEXT:  .LBB0_25: ; %Flow49
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s48
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[52:53]
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_mov_b32 s4, exec_lo
+; CHECK-NEXT:    ds_read_b32 v47, v0 offset:15360
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_cmpx_gt_u32_e64 v47, v40
+; CHECK-NEXT:    s_cbranch_execz .LBB0_33
+; CHECK-NEXT:  ; %bb.26:
+; CHECK-NEXT:    s_add_u32 s52, s44, 8
+; CHECK-NEXT:    s_addc_u32 s53, s45, 0
+; CHECK-NEXT:    s_getpc_b64 s[42:43]
+; CHECK-NEXT:    s_add_u32 s42, s42, _Z10atomic_addPU3AS1Vjj@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s43, s43, _Z10atomic_addPU3AS1Vjj@rel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s54, 0
+; CHECK-NEXT:    s_getpc_b64 s[44:45]
+; CHECK-NEXT:    s_add_u32 s44, s44, _Z10atomic_subPU3AS1Vjj@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s45, s45, _Z10atomic_subPU3AS1Vjj@rel32@hi+12
+; CHECK-NEXT:    s_getpc_b64 s[48:49]
+; CHECK-NEXT:    s_add_u32 s48, s48, _Z14get_local_sizej@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s49, s49, _Z14get_local_sizej@rel32@hi+12
+; CHECK-NEXT:    s_branch .LBB0_28
+; CHECK-NEXT:  .LBB0_27: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s55
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; CHECK-NEXT:    v_add_co_u32 v40, vcc_lo, v0, v40
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc_lo, v47, v40
+; CHECK-NEXT:    s_or_b32 s54, vcc_lo, s54
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
+; CHECK-NEXT:    s_cbranch_execz .LBB0_33
+; CHECK-NEXT:  .LBB0_28: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v40
+; CHECK-NEXT:    s_mov_b32 s55, exec_lo
+; CHECK-NEXT:    ds_read_b32 v0, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_lshrrev_b32_e32 v63, 10, v0
+; CHECK-NEXT:    v_bfe_u32 v62, v0, 5, 5
+; CHECK-NEXT:    v_and_b32_e32 v72, 31, v0
+; CHECK-NEXT:    v_mul_u32_u24_e32 v1, 0x180, v63
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 5, v62
+; CHECK-NEXT:    v_lshlrev_b32_e32 v4, 5, v72
+; CHECK-NEXT:    v_add_co_u32 v2, s4, s52, v1
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, s53, 0, s4
+; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_xor_b32_e32 v46, v9, v5
+; CHECK-NEXT:    v_xor_b32_e32 v45, v8, v4
+; CHECK-NEXT:    v_xor_b32_e32 v57, v11, v7
+; CHECK-NEXT:    v_xor_b32_e32 v56, v10, v6
+; CHECK-NEXT:    v_or_b32_e32 v5, v46, v57
+; CHECK-NEXT:    v_or_b32_e32 v4, v45, v56
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[4:5]
+; CHECK-NEXT:    s_cbranch_execz .LBB0_27
+; CHECK-NEXT:  ; %bb.29: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx2 v[58:59], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[60:61], v[0:1], off offset:16
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 4, v45
+; CHECK-NEXT:    v_alignbit_b32 v1, v46, v45, 12
+; CHECK-NEXT:    v_and_b32_e32 v2, 0xf0000, v45
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    v_and_b32_e32 v3, 0xf000, v0
+; CHECK-NEXT:    v_and_b32_e32 v4, 0xf00, v1
+; CHECK-NEXT:    v_and_b32_e32 v0, 0xf0, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, 15, v1
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    v_or3_b32 v2, v3, v2, v4
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    v_or3_b32 v73, v2, v0, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 1, v73
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 2, v73
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fffc, v0
+; CHECK-NEXT:    v_lshlrev_b32_e64 v44, v1, 1
+; CHECK-NEXT:    v_and_b32_e32 v74, 28, v1
+; CHECK-NEXT:    v_add_co_u32 v42, s4, s50, v0
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v43, null, s51, 0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v2, v44
+; CHECK-NEXT:    v_mov_b32_e32 v0, v42
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    v_mov_b32_e32 v1, v43
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT:    v_bfe_u32 v0, v0, v74, 4
+; CHECK-NEXT:    s_mov_b32 s4, exec_lo
+; CHECK-NEXT:    v_cmpx_gt_u32_e32 12, v0
+; CHECK-NEXT:    s_xor_b32 s4, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execz .LBB0_31
+; CHECK-NEXT:  ; %bb.30: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT:    v_xor_b32_e32 v5, v60, v58
+; CHECK-NEXT:    v_lshrrev_b64 v[3:4], 16, v[56:57]
+; CHECK-NEXT:    v_mul_u32_u24_e32 v11, 0x180, v73
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; CHECK-NEXT:    v_lshrrev_b64 v[1:2], 16, v[45:46]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; CHECK-NEXT:    v_lshlrev_b32_e32 v8, 6, v72
+; CHECK-NEXT:    v_lshlrev_b32_e32 v10, 12, v63
+; CHECK-NEXT:    v_xor_b32_e32 v6, v61, v59
+; CHECK-NEXT:    v_lshlrev_b32_e32 v9, 16, v56
+; CHECK-NEXT:    v_or_b32_e32 v4, v7, v4
+; CHECK-NEXT:    v_add_co_u32 v7, s5, s46, v11
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, s47, 0, s5
+; CHECK-NEXT:    v_or3_b32 v10, v8, v10, v62
+; CHECK-NEXT:    v_add_co_u32 v7, vcc_lo, v7, v0
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v11, vcc_lo
+; CHECK-NEXT:    v_lshrrev_b64 v[5:6], 16, v[5:6]
+; CHECK-NEXT:    v_or_b32_e32 v2, v9, v2
+; CHECK-NEXT:    global_store_dword v[7:8], v10, off offset:4
+; CHECK-NEXT:    global_store_dwordx4 v[7:8], v[1:4], off offset:8
+; CHECK-NEXT:    global_store_dwordx2 v[7:8], v[5:6], off offset:24
+; CHECK-NEXT:    ; implicit-def: $vgpr42
+; CHECK-NEXT:    ; implicit-def: $vgpr43
+; CHECK-NEXT:    ; implicit-def: $vgpr44
+; CHECK-NEXT:  .LBB0_31: ; %Flow
+; CHECK-NEXT:    ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT:    s_andn2_saveexec_b32 s4, s4
+; CHECK-NEXT:    s_cbranch_execz .LBB0_27
+; CHECK-NEXT:  ; %bb.32: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, v42
+; CHECK-NEXT:    v_mov_b32_e32 v1, v43
+; CHECK-NEXT:    v_mov_b32_e32 v2, v44
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[44:45]
+; CHECK-NEXT:    s_branch .LBB0_27
+; CHECK-NEXT:  .LBB0_33:
+; CHECK-NEXT:    s_endpgm
+  %6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4
+  %7 = trunc i64 %6 to i32
+  %8 = tail call i64 @_Z12get_local_idj(i32 noundef 0) #4
+  %9 = trunc i64 %8 to i32
+  %10 = mul i32 %9, 14
+  %11 = getelementptr inbounds i8, ptr addrspace(3) @kernel_round1.first_words_data, i32 %10
+  store i32 0, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
+  tail call void @_Z7barrierj(i32 noundef 1) #5
+  %12 = lshr i64 %6, 3
+  %13 = shl i32 %7, 2
+  %14 = and i32 %13, 28
+  %15 = and i64 %12, 536870911
+  %16 = getelementptr inbounds i32, ptr addrspace(1) %2, i64 %15
+  %17 = load i32, ptr addrspace(1) %16, align 4, !tbaa !11
+  %18 = lshr i32 %17, %14
+  %19 = and i32 %18, 15
+  %20 = tail call i32 @_Z3minjj(i32 noundef %19, i32 noundef 12) #4
+  %21 = icmp eq i32 %20, 0
+  br i1 %21, label %119, label %27
+
+22:                                               ; preds = %27
+  %23 = add i32 %20, -1
+  %24 = icmp eq i32 %23, 0
+  br i1 %24, label %119, label %25
+
+25:                                               ; preds = %22
+  %26 = shl i32 %7, 10
+  br label %37
+
+27:                                               ; preds = %5, %27
+  %28 = phi i32 [ %30, %27 ], [ 0, %5 ]
+  %29 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %28
+  store i8 0, ptr addrspace(3) %29, align 1, !tbaa !15
+  %30 = add nuw i32 %28, 1
+  %31 = icmp eq i32 %30, %20
+  br i1 %31, label %22, label %27
+
+32:                                               ; preds = %114, %48
+  %33 = phi i32 [ %50, %48 ], [ %115, %114 ]
+  %34 = icmp ult i32 %44, %23
+  %35 = icmp ult i32 %33, 60
+  %36 = select i1 %34, i1 %35, i1 false
+  br i1 %36, label %37, label %119
+
+37:                                               ; preds = %32, %25
+  %38 = phi i32 [ 0, %25 ], [ %44, %32 ]
+  %39 = phi i32 [ 0, %25 ], [ %33, %32 ]
+  %40 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %38
+  %41 = load i8, ptr addrspace(3) %40, align 1, !tbaa !15
+  %42 = shl i32 %38, 5
+  %43 = or i32 %42, %26
+  %44 = add nuw i32 %38, 1
+  %45 = or i32 %43, %44
+  %46 = add i32 %38, 5
+  %47 = icmp ult i32 %46, %20
+  br i1 %47, label %53, label %48
+
+48:                                               ; preds = %98, %37
+  %49 = phi i32 [ %45, %37 ], [ %100, %98 ]
+  %50 = phi i32 [ %39, %37 ], [ %99, %98 ]
+  %51 = phi i32 [ %44, %37 ], [ %54, %98 ]
+  %52 = icmp ult i32 %51, %20
+  br i1 %52, label %103, label %32
+
+53:                                               ; preds = %37, %98
+  %54 = phi i32 [ %101, %98 ], [ %46, %37 ]
+  %55 = phi i32 [ %54, %98 ], [ %44, %37 ]
+  %56 = phi i32 [ %99, %98 ], [ %39, %37 ]
+  %57 = phi i32 [ %100, %98 ], [ %45, %37 ]
+  %58 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %55
+  %59 = load i8, ptr addrspace(3) %58, align 1, !tbaa !15
+  %60 = icmp eq i8 %41, %59
+  br i1 %60, label %61, label %65
+
+61:                                               ; preds = %53
+  %62 = add i32 %56, 1
+  %63 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+  %64 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %63
+  store i32 %57, ptr addrspace(3) %64, align 4, !tbaa !11
+  br label %65
+
+65:                                               ; preds = %61, %53
+  %66 = phi i32 [ %62, %61 ], [ %56, %53 ]
+  %67 = add i32 %55, 1
+  %68 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %67
+  %69 = load i8, ptr addrspace(3) %68, align 1, !tbaa !15
+  %70 = icmp eq i8 %41, %69
+  br i1 %70, label %71, label %76
+
+71:                                               ; preds = %65
+  %72 = add i32 %57, 1
+  %73 = add i32 %66, 1
+  %74 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+  %75 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %74
+  store i32 %72, ptr addrspace(3) %75, align 4, !tbaa !11
+  br label %76
+
+76:                                               ; preds = %71, %65
+  %77 = phi i32 [ %73, %71 ], [ %66, %65 ]
+  %78 = add i32 %55, 2
+  %79 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %78
+  %80 = load i8, ptr addrspace(3) %79, align 1, !tbaa !15
+  %81 = icmp eq i8 %41, %80
+  br i1 %81, label %82, label %87
+
+82:                                               ; preds = %76
+  %83 = add i32 %57, 2
+  %84 = add i32 %77, 1
+  %85 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+  %86 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %85
+  store i32 %83, ptr addrspace(3) %86, align 4, !tbaa !11
+  br label %87
+
+87:                                               ; preds = %82, %76
+  %88 = phi i32 [ %84, %82 ], [ %77, %76 ]
+  %89 = add i32 %55, 3
+  %90 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %89
+  %91 = load i8, ptr addrspace(3) %90, align 1, !tbaa !15
+  %92 = icmp eq i8 %41, %91
+  br i1 %92, label %93, label %98
+
+93:                                               ; preds = %87
+  %94 = add i32 %57, 3
+  %95 = add i32 %88, 1
+  %96 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+  %97 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %96
+  store i32 %94, ptr addrspace(3) %97, align 4, !tbaa !11
+  br label %98
+
+98:                                               ; preds = %93, %87
+  %99 = phi i32 [ %95, %93 ], [ %88, %87 ]
+  %100 = add i32 %57, 4
+  %101 = add i32 %54, 4
+  %102 = icmp ult i32 %101, %20
+  br i1 %102, label %53, label %48
+
+103:                                              ; preds = %48, %114
+  %104 = phi i32 [ %117, %114 ], [ %51, %48 ]
+  %105 = phi i32 [ %115, %114 ], [ %50, %48 ]
+  %106 = phi i32 [ %116, %114 ], [ %49, %48 ]
+  %107 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %104
+  %108 = load i8, ptr addrspace(3) %107, align 1, !tbaa !15
+  %109 = icmp eq i8 %41, %108
+  br i1 %109, label %110, label %114
+
+110:                                              ; preds = %103
+  %111 = add i32 %105, 1
+  %112 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+  %113 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %112
+  store i32 %106, ptr addrspace(3) %113, align 4, !tbaa !11
+  br label %114
+
+114:                                              ; preds = %110, %103
+  %115 = phi i32 [ %111, %110 ], [ %105, %103 ]
+  %116 = add i32 %106, 1
+  %117 = add nuw i32 %104, 1
+  %118 = icmp ult i32 %117, %20
+  br i1 %118, label %103, label %32
+
+119:                                              ; preds = %32, %22, %5
+  tail call void @_Z7barrierj(i32 noundef 1) #5
+  %120 = load i32, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
+  %121 = icmp ugt i32 %120, %9
+  br i1 %121, label %122, label %206
+
+122:                                              ; preds = %119
+  %123 = getelementptr inbounds i8, ptr addrspace(1) %0, i64 8
+  br label %124
+
+124:                                              ; preds = %201, %122
+  %125 = phi i32 [ %9, %122 ], [ %204, %201 ]
+  %126 = phi i64 [ %8, %122 ], [ %203, %201 ]
+  %127 = and i64 %126, 4294967295
+  %128 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %125
+  %129 = load i32, ptr addrspace(3) %128, align 4, !tbaa !11
+  %130 = lshr i32 %129, 10
+  %131 = lshr i32 %129, 5
+  %132 = and i32 %131, 31
+  %133 = and i32 %129, 31
+  %134 = mul nuw nsw i32 %130, 384
+  %135 = zext i32 %134 to i64
+  %136 = getelementptr inbounds i8, ptr addrspace(1) %123, i64 %135
+  %137 = shl nuw nsw i32 %132, 5
+  %138 = zext i32 %137 to i64
+  %139 = getelementptr inbounds i8, ptr addrspace(1) %136, i64 %138
+  %140 = shl nuw nsw i32 %133, 5
+  %141 = zext i32 %140 to i64
+  %142 = getelementptr inbounds i8, ptr addrspace(1) %136, i64 %141
+  %143 = getelementptr inbounds i64, ptr addrspace(1) %139, i64 1
+  %144 = load i64, ptr addrspace(1) %139, align 8, !tbaa !16
+  %145 = getelementptr inbounds i64, ptr addrspace(1) %142, i64 1
+  %146 = load i64, ptr addrspace(1) %142, align 8, !tbaa !16
+  %147 = xor i64 %146, %144
+  %148 = load i64, ptr addrspace(1) %143, align 8, !tbaa !16
+  %149 = load i64, ptr addrspace(1) %145, align 8, !tbaa !16
+  %150 = xor i64 %149, %148
+  %151 = icmp ne i64 %147, 0
+  %152 = icmp ne i64 %150, 0
+  %153 = select i1 %151, i1 true, i1 %152
+  br i1 %153, label %154, label %201
+
+154:                                              ; preds = %124
+  %155 = getelementptr inbounds i64, ptr addrspace(1) %142, i64 2
+  %156 = load i64, ptr addrspace(1) %155, align 8, !tbaa !16
+  %157 = getelementptr inbounds i64, ptr addrspace(1) %139, i64 2
+  %158 = load i64, ptr addrspace(1) %157, align 8, !tbaa !16
+  %159 = and i64 %147, 983040
+  %160 = shl i64 %147, 4
+  %161 = and i64 %160, 61440
+  %162 = or i64 %161, %159
+  %163 = lshr i64 %147, 12
+  %164 = and i64 %163, 3840
+  %165 = or i64 %162, %164
+  %166 = and i64 %160, 240
+  %167 = or i64 %165, %166
+  %168 = and i64 %163, 15
+  %169 = or i64 %167, %168
+  %170 = trunc i64 %169 to i32
+  %171 = lshr i64 %169, 3
+  %172 = shl nuw nsw i32 %170, 2
+  %173 = and i32 %172, 28
+  %174 = getelementptr inbounds i32, ptr addrspace(1) %3, i64 %171
+  %175 = shl nuw nsw i32 1, %173
+  %176 = tail call i32 @_Z10atomic_addPU3AS1Vjj(ptr addrspace(1) noundef %174, i32 noundef %175) #5
+  %177 = lshr i32 %176, %173
+  %178 = and i32 %177, 15
+  %179 = icmp ugt i32 %178, 11
+  br i1 %179, label %180, label %182
+
+180:                                              ; preds = %154
+  %181 = tail call i32 @_Z10atomic_subPU3AS1Vjj(ptr addrspace(1) noundef %174, i32 noundef %175) #5
+  br label %201
+
+182:                                              ; preds = %154
+  %183 = xor i64 %158, %156
+  %184 = lshr i64 %183, 16
+  %185 = tail call i64 @llvm.fshl.i64(i64 %183, i64 %150, i64 48)
+  %186 = tail call i64 @llvm.fshl.i64(i64 %150, i64 %147, i64 48)
+  %187 = shl nuw nsw i32 %133, 6
+  %188 = shl i32 %130, 12
+  %189 = or i32 %187, %188
+  %190 = or i32 %189, %132
+  %191 = mul nuw nsw i64 %169, 384
+  %192 = and i64 %191, 4294967168
+  %193 = getelementptr inbounds i8, ptr addrspace(1) %1, i64 %192
+  %194 = shl nuw nsw i32 %178, 5
+  %195 = or i32 %194, 8
+  %196 = zext i32 %195 to i64
+  %197 = getelementptr inbounds i8, ptr addrspace(1) %193, i64 %196
+  %198 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 -4
+  store i32 %190, ptr addrspace(1) %198, align 4, !tbaa !11
+  store i64 %186, ptr addrspace(1) %197, align 8, !tbaa !16
+  %199 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 8
+  store i64 %185, ptr addrspace(1) %199, align 8, !tbaa !16
+  %200 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 16
+  store i64 %184, ptr addrspace(1) %200, align 8, !tbaa !16
+  br label %201
+
+201:                                              ; preds = %182, %180, %124
+  %202 = tail call i64 @_Z14get_local_sizej(i32 noundef 0) #4
+  %203 = add i64 %202, %127
+  %204 = trunc i64 %203 to i32
+  %205 = icmp ugt i32 %120, %204
+  br i1 %205, label %124, label %206
+
+206:                                              ; preds = %201, %119
+  ret void
+}
+
+; Removed most of the if-else blocks
+
+define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapture noundef readonly align 1 %.0, ptr addrspace(1) nocapture noundef writeonly align 1 %.1, ptr addrspace(1) nocapture noundef readonly align 4 %.2, ptr addrspace(1) noundef align 4 %.3, ptr addrspace(1) nocapture noundef readnone align 4 %.4) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 !reqd_work_group_size !10 {
+; CHECK-LABEL: kernel_round1_short:
+; CHECK:       ; %bb.0: ; %.5
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_load_dwordx2 s[46:47], s[6:7], 0x10
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v40, v0
+; CHECK-NEXT:    s_add_u32 s42, s36, 40
+; CHECK-NEXT:    v_mov_b32_e32 v31, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_mov_b64 s[34:35], s[8:9]
+; CHECK-NEXT:    s_addc_u32 s43, s37, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    s_mov_b32 s33, s14
+; CHECK-NEXT:    s_mov_b32 s40, s13
+; CHECK-NEXT:    s_mov_b32 s41, s12
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+12
+; CHECK-NEXT:    v_mov_b32_e32 v43, 0
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT:    v_mov_b32_e32 v42, v0
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, _Z12get_local_idj@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, _Z12get_local_idj@rel32@hi+12
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT:    v_mul_lo_u32 v46, v0, 14
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
+; CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    ds_write_b32 v43, v43 offset:15360
+; CHECK-NEXT:    s_getpc_b64 s[44:45]
+; CHECK-NEXT:    s_add_u32 s44, s44, _Z7barrierj@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s45, s45, _Z7barrierj@rel32@hi+12
+; CHECK-NEXT:    v_add_nc_u32_e32 v44, 0x3c04, v46
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[44:45]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 1, v42
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 2, v42
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x7ffffffc, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, 28, v1
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    global_load_dword v0, v0, s[46:47]
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, _Z3minjj@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, _Z3minjj@rel32@hi+12
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_bfe_u32 v0, v0, v1, 4
+; CHECK-NEXT:    v_mov_b32_e32 v1, 12
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT:    v_mov_b32_e32 v41, v0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v42, 10, v42
+; CHECK-NEXT:    s_getpc_b64 s[42:43]
+; CHECK-NEXT:    s_add_u32 s42, s42, _Z10atomic_incPU3AS3Vj@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s43, s43, _Z10atomic_incPU3AS3Vj@rel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s46, 0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    v_add_nc_u32_e32 v45, -1, v41
+; CHECK-NEXT:    ds_write_b8 v46, v43 offset:15364
+; CHECK-NEXT:  .LBB1_1: ; %.37
+; CHECK-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NEXT:    ; Child Loop BB1_3 Depth 2
+; CHECK-NEXT:    ; Child Loop BB1_8 Depth 2
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, s4, v44
+; CHECK-NEXT:    s_lshl_b32 s5, s4, 5
+; CHECK-NEXT:    s_add_i32 s47, s4, 1
+; CHECK-NEXT:    s_add_i32 s6, s4, 5
+; CHECK-NEXT:    v_or3_b32 v47, s5, v42, s47
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    ds_read_u8 v46, v0
+; CHECK-NEXT:    v_mov_b32_e32 v56, s47
+; CHECK-NEXT:    s_mov_b32 s5, exec_lo
+; CHECK-NEXT:    v_cmpx_lt_u32_e64 s6, v41
+; CHECK-NEXT:    s_cbranch_execz .LBB1_5
+; CHECK-NEXT:  ; %bb.2: ; %.53.preheader
+; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    s_mov_b32 s6, 0
+; CHECK-NEXT:    s_mov_b32 s7, 0
+; CHECK-NEXT:  .LBB1_3: ; %.53
+; CHECK-NEXT:    ; Parent Loop BB1_1 Depth=1
+; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    s_add_i32 s7, s7, 4
+; CHECK-NEXT:    v_add_nc_u32_e32 v43, 1, v43
+; CHECK-NEXT:    s_add_i32 s8, s4, s7
+; CHECK-NEXT:    s_add_i32 s9, s8, 5
+; CHECK-NEXT:    s_add_i32 s8, s8, 1
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s9, v41
+; CHECK-NEXT:    v_mov_b32_e32 v56, s8
+; CHECK-NEXT:    s_or_b32 s6, vcc_lo, s6
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT:    s_cbranch_execnz .LBB1_3
+; CHECK-NEXT:  ; %bb.4: ; %Flow3
+; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, s7, v47
+; CHECK-NEXT:  .LBB1_5: ; %Flow4
+; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT:    s_mov_b32 s48, exec_lo
+; CHECK-NEXT:    v_cmpx_lt_u32_e64 v56, v41
+; CHECK-NEXT:    s_cbranch_execz .LBB1_11
+; CHECK-NEXT:  ; %bb.6: ; %.103.preheader
+; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    s_mov_b32 s49, 0
+; CHECK-NEXT:    s_inst_prefetch 0x1
+; CHECK-NEXT:    s_branch .LBB1_8
+; CHECK-NEXT:    .p2align 6
+; CHECK-NEXT:  .LBB1_7: ; %.114
+; CHECK-NEXT:    ; in Loop: Header=BB1_8 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s50
+; CHECK-NEXT:    v_add_nc_u32_e32 v56, 1, v56
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v56, v41
+; CHECK-NEXT:    s_or_b32 s49, vcc_lo, s49
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s49
+; CHECK-NEXT:    s_cbranch_execz .LBB1_10
+; CHECK-NEXT:  .LBB1_8: ; %.103
+; CHECK-NEXT:    ; Parent Loop BB1_1 Depth=1
+; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, v44, v56
+; CHECK-NEXT:    ds_read_u8 v0, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NEXT:    s_and_saveexec_b32 s50, s4
+; CHECK-NEXT:    s_cbranch_execz .LBB1_7
+; CHECK-NEXT:  ; %bb.9: ; %.110
+; CHECK-NEXT:    ; in Loop: Header=BB1_8 Depth=2
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT:    s_add_u32 s8, s36, 40
+; CHECK-NEXT:    s_addc_u32 s9, s37, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    v_add_nc_u32_e32 v43, 1, v43
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:    ds_write_b32 v0, v47
+; CHECK-NEXT:    s_branch .LBB1_7
+; CHECK-NEXT:  .LBB1_10: ; %Flow
+; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    s_inst_prefetch 0x2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s49
+; CHECK-NEXT:  .LBB1_11: ; %Flow2
+; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s48
+; CHECK-NEXT:  ; %bb.12: ; %.32
+; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s47, v45
+; CHECK-NEXT:    v_cmp_lt_u32_e64 s4, 59, v43
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_and_b32 s4, exec_lo, s4
+; CHECK-NEXT:    s_or_b32 s46, s4, s46
+; CHECK-NEXT:    s_mov_b32 s4, s47
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; CHECK-NEXT:    s_cbranch_execnz .LBB1_1
+; CHECK-NEXT:  ; %bb.13: ; %.119
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s46
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
+; CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; CHECK-NEXT:    s_add_u32 s8, s36, 40
+; CHECK-NEXT:    s_addc_u32 s9, s37, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[44:45]
+; CHECK-NEXT:    s_endpgm
+.5:
+  %.6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4
+  %.7 = trunc i64 %.6 to i32
+  %.8 = tail call i64 @_Z12get_local_idj(i32 noundef 0) #4
+  %.9 = trunc i64 %.8 to i32
+  %.10 = mul i32 %.9, 14
+  %.11 = getelementptr inbounds i8, ptr addrspace(3) @kernel_round1.first_words_data, i32 %.10
+  store i32 0, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
+  tail call void @_Z7barrierj(i32 noundef 1) #5
+  %.12 = lshr i64 %.6, 3
+  %.13 = shl i32 %.7, 2
+  %.14 = and i32 %.13, 28
+  %.15 = and i64 %.12, 536870911
+  %.16 = getelementptr inbounds i32, ptr addrspace(1) %.2, i64 %.15
+  %.17 = load i32, ptr addrspace(1) %.16, align 4, !tbaa !11
+  %.18 = lshr i32 %.17, %.14
+  %.19 = and i32 %.18, 15
+  %.20 = tail call i32 @_Z3minjj(i32 noundef %.19, i32 noundef 12) #4
+  %.21 = icmp eq i32 %.20, 0
+  %.23 = add i32 %.20, -1
+  %.24 = icmp eq i32 %.23, 0
+  store i8 0, ptr addrspace(3) %.11, align 1, !tbaa !15
+  br label %.37
+
+.32:                                               ; preds = %.114, %.48
+  %.33 = phi i32 [ %.50, %.48 ], [ %.115, %.114 ]
+  %.34 = icmp ult i32 %.44, %.23
+  %.35 = icmp ult i32 %.33, 60
+  %.36 = select i1 %.34, i1 %.35, i1 false
+  br i1 %.36, label %.37, label %.119
+
+.37:                                               ; preds = %.32, %.25
+  %.38 = phi i32 [ 0, %.5 ], [ %.44, %.32 ]
+  %.39 = phi i32 [ 0, %.5 ], [ %.33, %.32 ]
+  %.26 = shl i32 %.7, 10
+  %.40 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.38
+  %.41 = load i8, ptr addrspace(3) %.40, align 1, !tbaa !15
+  %.42 = shl i32 %.38, 5
+  %.43 = or i32 %.42, %.26
+  %.44 = add nuw i32 %.38, 1
+  %.45 = or i32 %.43, %.44
+  %.46 = add i32 %.38, 5
+  %.47 = icmp ult i32 %.46, %.20
+  br i1 %.47, label %.53, label %.48
+
+.48:                                               ; preds = %.98, %.37
+  %.49 = phi i32 [ %.45, %.37 ], [ %.100, %.98 ]
+  %.50 = phi i32 [ %.39, %.37 ], [ %.99, %.98 ]
+  %.51 = phi i32 [ %.44, %.37 ], [ %.54, %.98 ]
+  %.52 = icmp ult i32 %.51, %.20
+  br i1 %.52, label %.103, label %.32
+
+.53:                                               ; preds = %.37, %.98
+  %.54 = phi i32 [ %.101, %.98 ], [ %.46, %.37 ]
+  %.55 = phi i32 [ %.54, %.98 ], [ %.44, %.37 ]
+  %.56 = phi i32 [ %.99, %.98 ], [ %.39, %.37 ]
+  %.57 = phi i32 [ %.100, %.98 ], [ %.45, %.37 ]
+  %.58 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.55
+  %.59 = load i8, ptr addrspace(3) %.58, align 1, !tbaa !15
+  %.60 = icmp eq i8 %.41, %.59
+  br label %.98
+
+.98:                                               ; preds = %.93, %.87
+  %.99 = add i32 %.56, 1
+  %.100 = add i32 %.57, 4
+  %.101 = add i32 %.54, 4
+  %.102 = icmp ult i32 %.101, %.20
+  br i1 %.102, label %.53, label %.48
+
+.103:                                              ; preds = %.48, %.114
+  %.104 = phi i32 [ %.117, %.114 ], [ %.51, %.48 ]
+  %.105 = phi i32 [ %.115, %.114 ], [ %.50, %.48 ]
+  %.106 = phi i32 [ %.116, %.114 ], [ %.49, %.48 ]
+  %.107 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.104
+  %.108 = load i8, ptr addrspace(3) %.107, align 1, !tbaa !15
+  %.109 = icmp eq i8 %.41, %.108
+  br i1 %.109, label %.110, label %.114
+
+.110:                                              ; preds = %.103
+  %.111 = add i32 %.105, 1
+  %.112 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+  %.113 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %.112
+  store i32 %.106, ptr addrspace(3) %.113, align 4, !tbaa !11
+  br label %.114
+
+.114:                                              ; preds = %.110, %.103
+  %.115 = phi i32 [ %.111, %.110 ], [ %.105, %.103 ]
+  %.116 = add i32 %.106, 1
+  %.117 = add nuw i32 %.104, 1
+  %.118 = icmp ult i32 %.117, %.20
+  br i1 %.118, label %.103, label %.32
+
+.119:                                              ; preds = %.32, %.22, %.5
+  tail call void @_Z7barrierj(i32 noundef 1) #5
+  %.120 = load i32, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
+  %.121 = icmp ugt i32 %.120, %.9
+  br label %.206
+
+.206:                                              ; preds = %.201, %.119
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i64 @llvm.fshl.i64(i64, i64, i64) #3
+
+attributes #0 = { convergent mustprogress nofree nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
+attributes #1 = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
+attributes #2 = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="64,64" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" "uniform-work-group-size"="true" }
+attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #4 = { convergent nounwind willreturn memory(none) }
+attributes #5 = { convergent nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!opencl.ocl.version = !{!3}
+!llvm.ident = !{!4}
+
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 8, !"PIC Level", i32 2}
+!3 = !{i32 1, i32 2}
+!4 = !{!"clang version 17.0.0 (ssh://chfang@git.amd.com:29418/lightning/ec/llvm-project 06ead8cf696777b9f17876b60707ba9de4d0606f)"}
+!5 = !{i32 1, i32 1, i32 1, i32 1, i32 1}
+!6 = !{!"none", !"none", !"none", !"none", !"none"}
+!7 = !{!"char*", !"char*", !"uint*", !"uint*", !"uint*"}
+!8 = !{!"", !"", !"", !"", !""}
+!9 = !{!"ht_src", !"ht_dst", !"rowCountersSrc", !"rowCountersDst", !"debug"}
+!10 = !{i32 64, i32 1, i32 1}
+!11 = !{!12, !12, i64 0}
+!12 = !{!"int", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !{!13, !13, i64 0}
+!16 = !{!17, !17, i64 0}
+!17 = !{!"long", !13, i64 0}
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir
new file mode 100644
index 000000000000000..191b400011b6b2b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir
@@ -0,0 +1,1319 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -o - %s | FileCheck %s
+
+--- |
+  %llvm.amdgcn.kernel.kernel_round1.lds.t = type { [3840 x i32], i32, [896 x i8] }
+  @llvm.amdgcn.kernel.kernel_round1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kernel_round1.lds.t poison, align 4
+  declare hidden i64 @_Z13get_global_idj(i32 noundef)
+  declare hidden i64 @_Z12get_local_idj(i32 noundef)
+  declare hidden void @_Z7barrierj(i32 noundef)
+  declare hidden i32 @_Z3minjj(i32 noundef, i32 noundef)
+  declare hidden i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef)
+
+  define protected amdgpu_kernel void @kernel_round1_short() {
+    ret void
+  }
+
+  define protected amdgpu_kernel void @sink_salu() {
+    ret void
+  }
+...
+
+---
+name: kernel_round1_short
+alignment: 1
+tracksRegLiveness: true
+machineFunctionInfo:
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+
+body: |
+  ; CHECK-LABEL: name: kernel_round1_short
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr12, $sgpr13, $sgpr14
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY4]](p4), 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+  ; CHECK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 40
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY7]], [[COPY9]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY8]], [[COPY10]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z13get_global_idj + 4, target-flags(amdgpu-rel32-hi) @_Z13get_global_idj + 12, implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF]]
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY2]]
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY1]]
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr15 = COPY [[DEF1]]
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY6]](s32)
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY11]]
+  ; CHECK-NEXT:   $vgpr0 = COPY [[V_MOV_B32_e32_]]
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET]], @_Z13get_global_idj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0, implicit-def $vgpr1
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[SI_PC_ADD_REL_OFFSET1:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z12get_local_idj + 4, target-flags(amdgpu-rel32-hi) @_Z12get_local_idj + 12, implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF2]]
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY2]]
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY1]]
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr15 = COPY [[DEF3]]
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY6]](s32)
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY15]]
+  ; CHECK-NEXT:   $vgpr0 = COPY [[V_MOV_B32_e32_]]
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET1]], @_Z12get_local_idj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0, implicit-def $vgpr1
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY16]], %subreg.sub0, [[COPY17]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0
+  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[COPY18]], killed [[S_MOV_B32_]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 15364
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_]], killed [[S_MOV_B32_1]], 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], 15360, 0, implicit $exec :: (store (s32), align 1024, addrspace 3)
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[SI_PC_ADD_REL_OFFSET2:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z7barrierj + 4, target-flags(amdgpu-rel32-hi) @_Z7barrierj + 12, implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF4]]
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY2]]
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY1]]
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr15 = COPY [[DEF5]]
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY6]](s32)
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]]
+  ; CHECK-NEXT:   $vgpr0 = COPY [[V_MOV_B32_e32_1]]
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = SI_CALL [[SI_PC_ADD_REL_OFFSET2]], @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; CHECK-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_2]], [[COPY14]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 28
+  ; CHECK-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_LSHLREV_B32_e64_]], killed [[S_MOV_B32_3]], implicit $exec
+  ; CHECK-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY14]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483644
+  ; CHECK-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_LSHRREV_B32_e64_]], killed [[S_MOV_B32_4]], implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_AND_B32_e64_1]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; CHECK-NEXT:   [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 killed [[GLOBAL_LOAD_DWORD_SADDR]], killed [[V_AND_B32_e64_]], 4, implicit $exec
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[SI_PC_ADD_REL_OFFSET3:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z3minjj + 4, target-flags(amdgpu-rel32-hi) @_Z3minjj + 12, implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12, implicit $exec
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF6]]
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY2]]
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY1]]
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr15 = COPY [[DEF7]]
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY6]](s32)
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]]
+  ; CHECK-NEXT:   $vgpr0 = COPY [[V_BFE_U32_e64_]]
+  ; CHECK-NEXT:   $vgpr1 = COPY [[V_MOV_B32_e32_2]]
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET3]], @_Z3minjj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit-def $vgpr0
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[COPY21:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY21]], killed [[S_MOV_B32_5]], 0, implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]]
+  ; CHECK-NEXT:   DS_WRITE_B8_gfx9 [[V_MUL_LO_U32_e64_]], killed [[COPY22]], 15364, 0, implicit $exec :: (store (s8), addrspace 3)
+  ; CHECK-NEXT:   [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 10
+  ; CHECK-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_7]], [[COPY14]], implicit $exec
+  ; CHECK-NEXT:   [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 5
+  ; CHECK-NEXT:   [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; CHECK-NEXT:   [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 4
+  ; CHECK-NEXT:   [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+  ; CHECK-NEXT:   [[SI_PC_ADD_REL_OFFSET4:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z10atomic_incPU3AS3Vj + 4, target-flags(amdgpu-rel32-hi) @_Z10atomic_incPU3AS3Vj + 12, implicit-def dead $scc
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 15360, implicit $exec
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 59
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.10(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI %74, %bb.13
+  ; CHECK-NEXT:   SI_END_CF %75, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.14(0x04000000), %bb.3(0x7c000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_CMP_GE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 %77, [[V_ADD_U32_e64_1]], implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_U32_e64 %79, [[S_MOV_B32_12]], implicit $exec
+  ; CHECK-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[V_CMP_GE_U32_e64_]], killed [[V_CMP_GT_U32_e64_]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[S_OR_B32_]], %82, implicit-def dead $scc
+  ; CHECK-NEXT:   SI_LOOP [[SI_IF_BREAK]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.14
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.8(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_6]], %bb.0, [[SI_IF_BREAK]], %bb.2
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_6]], %bb.0, %77, %bb.2
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY23]], %bb.0, %79, %bb.2
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[PHI2]], 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 killed [[V_ADD_U32_e64_2]], 0, 0, implicit $exec :: (load (s8), addrspace 3)
+  ; CHECK-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[PHI2]], [[S_MOV_B32_8]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = nuw S_ADD_I32 [[PHI2]], [[S_MOV_B32_9]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 killed [[S_LSHL_B32_]], [[V_LSHLREV_B32_e64_1]], [[S_ADD_I32_]], implicit $exec
+  ; CHECK-NEXT:   [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], [[S_MOV_B32_8]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_U32_e64 killed [[S_ADD_I32_1]], [[COPY21]], implicit $exec
+  ; CHECK-NEXT:   [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_U32_e64_]], %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.9(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   S_BRANCH %bb.9
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:vgpr_32 = PHI %95, %bb.9
+  ; CHECK-NEXT:   SI_END_CF %96, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_OR3_B32_e64_]], %108, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.10(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_CMP_LT_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_LT_U32_e64 %98, [[COPY21]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_U32_e64_1]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.11(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[DS_READ_U8_gfx9_]], [[S_MOV_B32_11]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.11
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[COPY24]], %bb.3, [[PHI4]], %bb.5
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.3, %103, %bb.5
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:vgpr_32 = PHI [[V_OR3_B32_e64_]], %bb.3, [[V_ADD_U32_e64_3]], %bb.5
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.9:
+  ; CHECK-NEXT:   successors: %bb.5(0x04000000), %bb.9(0x7c000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_13]], %bb.4, %96, %bb.9
+  ; CHECK-NEXT:   [[PHI9:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_13]], %bb.4, %108, %bb.9
+  ; CHECK-NEXT:   [[PHI10:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.4, %103, %bb.9
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI10]], [[S_MOV_B32_9]], 0, implicit $exec
+  ; CHECK-NEXT:   [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI9]], [[S_MOV_B32_10]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], [[S_ADD_I32_2]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_3]], [[S_MOV_B32_8]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 killed [[S_ADD_I32_4]], [[COPY21]], implicit $exec
+  ; CHECK-NEXT:   [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_3]], [[S_MOV_B32_9]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[V_CMP_GE_U32_e64_1]], [[PHI8]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_5]], implicit $exec
+  ; CHECK-NEXT:   SI_LOOP [[SI_IF_BREAK1]], %bb.9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.10:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI11:%[0-9]+]]:vgpr_32 = PHI [[PHI6]], %bb.6, [[PHI]], %bb.1
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.11:
+  ; CHECK-NEXT:   successors: %bb.12(0x40000000), %bb.13(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI12:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_14]], %bb.7, %75, %bb.13
+  ; CHECK-NEXT:   [[PHI13:%[0-9]+]]:vgpr_32 = PHI [[PHI5]], %bb.7, %116, %bb.13
+  ; CHECK-NEXT:   [[PHI14:%[0-9]+]]:vgpr_32 = PHI [[PHI6]], %bb.7, %74, %bb.13
+  ; CHECK-NEXT:   [[PHI15:%[0-9]+]]:vgpr_32 = PHI [[PHI7]], %bb.7, %119, %bb.13
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[PHI13]], 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_U8_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 killed [[V_ADD_U32_e64_5]], 0, 0, implicit $exec :: (load (s8), addrspace 3)
+  ; CHECK-NEXT:   [[V_CMP_EQ_U16_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U16_e64 [[V_AND_B32_e64_2]], killed [[DS_READ_U8_gfx9_1]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U16_e64_]], %bb.13, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.12
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.12:
+  ; CHECK-NEXT:   successors: %bb.13(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI14]], [[S_MOV_B32_9]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY26:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
+  ; CHECK-NEXT:   [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
+  ; CHECK-NEXT:   [[COPY28:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
+  ; CHECK-NEXT:   [[COPY29:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY26]], [[COPY28]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_1:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY27]], [[COPY29]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_1]], %subreg.sub0, [[S_ADDC_U32_1]], %subreg.sub1
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[COPY30:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY5]]
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF8]]
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[REG_SEQUENCE3]]
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY2]]
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY1]]
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY]]
+  ; CHECK-NEXT:   $sgpr15 = COPY [[DEF9]]
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY6]](s32)
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY30]]
+  ; CHECK-NEXT:   $vgpr0 = COPY [[V_MOV_B32_e32_3]]
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = SI_CALL [[SI_PC_ADD_REL_OFFSET4]], @_Z10atomic_incPU3AS3Vj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_2]], [[COPY31]], implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 killed [[V_LSHLREV_B32_e64_2]], [[PHI15]], 0, 0, implicit $exec :: (store (s32), addrspace 3)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.13:
+  ; CHECK-NEXT:   successors: %bb.1(0x04000000), %bb.11(0x7c000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI16:%[0-9]+]]:vgpr_32 = PHI [[PHI14]], %bb.11, [[V_ADD_U32_e64_6]], %bb.12
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI15]], [[S_MOV_B32_9]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI13]], [[S_MOV_B32_9]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_GE_U32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[V_ADD_U32_e64_8]], [[COPY21]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF_BREAK2:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[V_CMP_GE_U32_e64_2]], [[PHI12]], implicit-def dead $scc
+  ; CHECK-NEXT:   SI_LOOP [[SI_IF_BREAK2]], %bb.11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.14:
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 40
+  ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
+  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
+  ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_1]].sub0
+  ; CHECK-NEXT:   [[COPY35:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_1]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_2:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY32]], [[COPY34]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_2:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY33]], [[COPY35]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_2]], %subreg.sub0, [[S_ADDC_U32_2]], %subreg.sub1
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[COPY36:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF10]]
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[REG_SEQUENCE4]]
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY2]]
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY1]]
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr15 = COPY [[DEF11]]
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY6]](s32)
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY36]]
+  ; CHECK-NEXT:   $vgpr0 = COPY [[V_MOV_B32_e32_4]]
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = SI_CALL [[SI_PC_ADD_REL_OFFSET2]], @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.3(0x80000000)
+    liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr12, $sgpr13, $sgpr14
+
+    %51:sgpr_32 = COPY $sgpr14
+    %50:sgpr_32 = COPY $sgpr13
+    %49:sgpr_32 = COPY $sgpr12
+    %47:sgpr_64 = COPY $sgpr8_sgpr9
+    %46:sgpr_64(p4) = COPY $sgpr6_sgpr7
+    %45:sgpr_64 = COPY $sgpr4_sgpr5
+    %43:vgpr_32(s32) = COPY $vgpr0
+    %54:sreg_64_xexec = S_LOAD_DWORDX2_IMM %46(p4), 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+    %55:sreg_64 = S_MOV_B64 40
+    %158:sreg_32 = COPY %46.sub0(p4)
+    %159:sreg_32 = COPY %46.sub1(p4)
+    %160:sreg_32 = COPY %55.sub0
+    %161:sreg_32 = COPY %55.sub1
+    %156:sreg_32 = S_ADD_U32 %158, %160, implicit-def $scc
+    %157:sreg_32 = S_ADDC_U32 %159, %161, implicit-def $scc, implicit $scc
+    %56:sreg_64 = REG_SEQUENCE %156, %subreg.sub0, %157, %subreg.sub1
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %57:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z13get_global_idj + 4, target-flags(amdgpu-rel32-hi) @_Z13get_global_idj + 12, implicit-def dead $scc
+    %58:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %59:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    $sgpr4_sgpr5 = COPY %45
+    %60:sreg_64 = IMPLICIT_DEF
+    $sgpr6_sgpr7 = COPY %60
+    $sgpr8_sgpr9 = COPY %56
+    $sgpr10_sgpr11 = COPY %47
+    $sgpr12 = COPY %49
+    $sgpr13 = COPY %50
+    $sgpr14 = COPY %51
+    %61:sreg_32 = IMPLICIT_DEF
+    $sgpr15 = COPY %61
+    $vgpr31 = COPY %43(s32)
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %58
+    $vgpr0 = COPY %59
+    $sgpr30_sgpr31 = SI_CALL killed %57, @_Z13get_global_idj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0, implicit-def $vgpr1
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %62:vgpr_32 = COPY $vgpr0
+    %63:vgpr_32 = COPY $vgpr1
+    %150:vreg_64 = REG_SEQUENCE %62, %subreg.sub0, %63, %subreg.sub1
+    %152:vgpr_32 = COPY %150.sub0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %66:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z12get_local_idj + 4, target-flags(amdgpu-rel32-hi) @_Z12get_local_idj + 12, implicit-def dead $scc
+    %67:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    $sgpr4_sgpr5 = COPY %45
+    %68:sreg_64 = IMPLICIT_DEF
+    $sgpr6_sgpr7 = COPY %68
+    $sgpr8_sgpr9 = COPY %56
+    $sgpr10_sgpr11 = COPY %47
+    $sgpr12 = COPY %49
+    $sgpr13 = COPY %50
+    $sgpr14 = COPY %51
+    %69:sreg_32 = IMPLICIT_DEF
+    $sgpr15 = COPY %69
+    $vgpr31 = COPY %43(s32)
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %67
+    $vgpr0 = COPY %59
+    $sgpr30_sgpr31 = SI_CALL killed %66, @_Z12get_local_idj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0, implicit-def $vgpr1
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %70:vgpr_32 = COPY $vgpr0
+    %71:vgpr_32 = COPY $vgpr1
+    %149:vreg_64 = REG_SEQUENCE %70, %subreg.sub0, %71, %subreg.sub1
+    %151:vgpr_32 = COPY %149.sub0
+    %74:sreg_32 = S_MOV_B32 14
+    %75:vgpr_32 = V_MUL_LO_U32_e64 killed %151, killed %74, implicit $exec
+    %76:sreg_32 = S_MOV_B32 15364
+    %4:vgpr_32 = V_ADD_U32_e64 %75, killed %76, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %59, %59, 15360, 0, implicit $exec :: (store (s32), align 1024, addrspace 3)
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %77:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z7barrierj + 4, target-flags(amdgpu-rel32-hi) @_Z7barrierj + 12, implicit-def dead $scc
+    %78:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %79:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    $sgpr4_sgpr5 = COPY %45
+    %80:sreg_64 = IMPLICIT_DEF
+    $sgpr6_sgpr7 = COPY %80
+    $sgpr8_sgpr9 = COPY %56
+    $sgpr10_sgpr11 = COPY %47
+    $sgpr12 = COPY %49
+    $sgpr13 = COPY %50
+    $sgpr14 = COPY %51
+    %81:sreg_32 = IMPLICIT_DEF
+    $sgpr15 = COPY %81
+    $vgpr31 = COPY %43(s32)
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %78
+    $vgpr0 = COPY %79
+    $sgpr30_sgpr31 = SI_CALL %77, @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %82:sreg_32 = S_MOV_B32 2
+    %83:vgpr_32 = V_LSHLREV_B32_e64 %82, %152, implicit $exec
+    %84:sreg_32 = S_MOV_B32 28
+    %85:vgpr_32 = V_AND_B32_e64 killed %83, killed %84, implicit $exec
+    %86:vgpr_32 = V_LSHRREV_B32_e64 %79, %152, implicit $exec
+    %87:sreg_32 = S_MOV_B32 2147483644
+    %88:vgpr_32 = V_AND_B32_e64 killed %86, killed %87, implicit $exec
+    %89:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed %54, killed %88, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %90:vgpr_32 = V_BFE_U32_e64 killed %89, killed %85, 4, implicit $exec
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %91:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z3minjj + 4, target-flags(amdgpu-rel32-hi) @_Z3minjj + 12, implicit-def dead $scc
+    %92:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %93:vgpr_32 = V_MOV_B32_e32 12, implicit $exec
+    $sgpr4_sgpr5 = COPY %45
+    %94:sreg_64 = IMPLICIT_DEF
+    $sgpr6_sgpr7 = COPY %94
+    $sgpr8_sgpr9 = COPY %56
+    $sgpr10_sgpr11 = COPY %47
+    $sgpr12 = COPY %49
+    $sgpr13 = COPY %50
+    $sgpr14 = COPY %51
+    %95:sreg_32 = IMPLICIT_DEF
+    $sgpr15 = COPY %95
+    $vgpr31 = COPY %43(s32)
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %92
+    $vgpr0 = COPY %90
+    $vgpr1 = COPY %93
+    $sgpr30_sgpr31 = SI_CALL killed %91, @_Z3minjj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit-def $vgpr0
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %96:vgpr_32 = COPY $vgpr0
+    %97:sreg_32 = S_MOV_B32 -1
+    %2:vgpr_32 = V_ADD_U32_e64 %96, killed %97, 0, implicit $exec
+    %98:sreg_32 = S_MOV_B32 0
+    %99:vgpr_32 = COPY %98
+    DS_WRITE_B8_gfx9 %75, killed %99, 15364, 0, implicit $exec :: (store (s8), addrspace 3)
+    %100:sreg_32 = S_MOV_B32 10
+    %3:vgpr_32 = V_LSHLREV_B32_e64 killed %100, %152, implicit $exec
+    %153:vgpr_32 = COPY %98, implicit $exec
+    %102:sreg_32 = S_MOV_B32 5
+    %104:sreg_32 = S_MOV_B32 1
+    %109:sreg_32 = S_MOV_B32 4
+    %118:sreg_32 = S_MOV_B32 255
+    %124:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z10atomic_incPU3AS3Vj + 4, target-flags(amdgpu-rel32-hi) @_Z10atomic_incPU3AS3Vj + 12, implicit-def dead $scc
+    %126:vgpr_32 = V_MOV_B32_e32 15360, implicit $exec
+    %127:sreg_64 = IMPLICIT_DEF
+    %128:sreg_32 = IMPLICIT_DEF
+    %135:sreg_32 = S_MOV_B32 59
+    S_BRANCH %bb.3
+
+  bb.1:
+    successors: %bb.10(0x80000000)
+
+    %5:vgpr_32 = PHI %38, %bb.13
+    SI_END_CF %41, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.10
+
+  bb.2:
+    successors: %bb.14(0x04000000), %bb.3(0x7c000000)
+
+    %134:sreg_32 = V_CMP_GE_U32_e64 %12, %2, implicit $exec
+    %136:sreg_32 = V_CMP_GT_U32_e64 %31, %135, implicit $exec
+    %137:sreg_32 = S_OR_B32 killed %134, killed %136, implicit-def dead $scc
+    %7:sreg_32 = SI_IF_BREAK killed %137, %8, implicit-def dead $scc
+    SI_LOOP %7, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.14
+
+  bb.3:
+    successors: %bb.4(0x40000000), %bb.8(0x40000000)
+
+    %8:sreg_32 = PHI %98, %bb.0, %7, %bb.2
+    %9:sreg_32 = PHI %98, %bb.0, %12, %bb.2
+    %10:vgpr_32 = PHI %153, %bb.0, %31, %bb.2
+    %101:vgpr_32 = V_ADD_U32_e64 %4, %9, 0, implicit $exec
+    %11:vgpr_32 = DS_READ_U8_gfx9 killed %101, 0, 0, implicit $exec :: (load (s8), addrspace 3)
+    %103:sreg_32 = S_LSHL_B32 %9, %102, implicit-def dead $scc
+    %12:sreg_32 = nuw S_ADD_I32 %9, %104, implicit-def dead $scc
+    %13:vgpr_32 = V_OR3_B32_e64 killed %103, %3, %12, implicit $exec
+    %105:sreg_32 = S_ADD_I32 %9, %102, implicit-def dead $scc
+    %106:sreg_32 = V_CMP_LT_U32_e64 killed %105, %96, implicit $exec
+    %155:vgpr_32 = COPY %12, implicit $exec
+    %14:sreg_32 = SI_IF killed %106, %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    successors: %bb.9(0x80000000)
+
+    %107:sreg_32 = S_MOV_B32 0
+    S_BRANCH %bb.9
+
+  bb.5:
+    successors: %bb.8(0x80000000)
+
+    %17:vgpr_32 = PHI %154, %bb.9
+    SI_END_CF %30, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.8
+
+  bb.6:
+    successors: %bb.7(0x40000000), %bb.10(0x40000000)
+
+    %114:sreg_32 = V_CMP_LT_U32_e64 %20, %96, implicit $exec
+    %19:sreg_32 = SI_IF killed %114, %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.7
+
+  bb.7:
+    successors: %bb.11(0x80000000)
+
+    %115:sreg_32 = S_MOV_B32 0
+    %119:vgpr_32 = V_AND_B32_e64 %11, %118, implicit $exec
+    S_BRANCH %bb.11
+
+  bb.8:
+    successors: %bb.6(0x80000000)
+
+    %20:vgpr_32 = PHI %155, %bb.3, %17, %bb.5
+    %21:vgpr_32 = PHI %10, %bb.3, %26, %bb.5
+    %22:vgpr_32 = PHI %13, %bb.3, %28, %bb.5
+    SI_END_CF %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.6
+
+  bb.9:
+    successors: %bb.5(0x04000000), %bb.9(0x7c000000)
+
+    %23:sreg_32 = PHI %107, %bb.4, %30, %bb.9
+    %24:sreg_32 = PHI %107, %bb.4, %27, %bb.9
+    %25:vgpr_32 = PHI %10, %bb.4, %26, %bb.9
+    %26:vgpr_32 = V_ADD_U32_e64 %25, %104, 0, implicit $exec
+    %27:sreg_32 = S_ADD_I32 %24, %109, implicit-def dead $scc
+    %110:sreg_32 = S_ADD_I32 %9, %27, implicit-def dead $scc
+    %112:sreg_32 = S_ADD_I32 %110, %102, implicit-def dead $scc
+    %113:sreg_32 = V_CMP_GE_U32_e64 killed %112, %96, implicit $exec
+    %28:vgpr_32 = V_ADD_U32_e64 %13, %27, 0, implicit $exec
+    %29:sreg_32 = S_ADD_I32 %110, %104, implicit-def dead $scc
+    %30:sreg_32 = SI_IF_BREAK killed %113, %23, implicit-def dead $scc
+    %154:vgpr_32 = COPY %29, implicit $exec
+    SI_LOOP %30, %bb.9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.10:
+    successors: %bb.2(0x80000000)
+
+    %31:vgpr_32 = PHI %21, %bb.6, %5, %bb.1
+    SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.11:
+    successors: %bb.12(0x40000000), %bb.13(0x40000000)
+
+    %32:sreg_32 = PHI %115, %bb.7, %41, %bb.13
+    %33:vgpr_32 = PHI %20, %bb.7, %40, %bb.13
+    %34:vgpr_32 = PHI %21, %bb.7, %38, %bb.13
+    %35:vgpr_32 = PHI %22, %bb.7, %39, %bb.13
+    %116:vgpr_32 = V_ADD_U32_e64 %4, %33, 0, implicit $exec
+    %117:vgpr_32 = DS_READ_U8_gfx9 killed %116, 0, 0, implicit $exec :: (load (s8), addrspace 3)
+    %120:sreg_32 = V_CMP_EQ_U16_e64 %119, killed %117, implicit $exec
+    %36:sreg_32 = SI_IF killed %120, %bb.13, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.12
+
+  bb.12:
+    successors: %bb.13(0x80000000)
+
+    %37:vgpr_32 = V_ADD_U32_e64 %34, %104, 0, implicit $exec
+    %164:sreg_32 = COPY %46.sub0(p4)
+    %165:sreg_32 = COPY %46.sub1(p4)
+    %166:sreg_32 = COPY %55.sub0
+    %167:sreg_32 = COPY %55.sub1
+    %162:sreg_32 = S_ADD_U32 %164, %166, implicit-def $scc
+    %163:sreg_32 = S_ADDC_U32 %165, %167, implicit-def $scc, implicit $scc
+    %123:sreg_64 = REG_SEQUENCE %162, %subreg.sub0, %163, %subreg.sub1
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %125:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    $sgpr4_sgpr5 = COPY %45
+    $sgpr6_sgpr7 = COPY %127
+    $sgpr8_sgpr9 = COPY %123
+    $sgpr10_sgpr11 = COPY %47
+    $sgpr12 = COPY %49
+    $sgpr13 = COPY %50
+    $sgpr14 = COPY %51
+    $sgpr15 = COPY %128
+    $vgpr31 = COPY %43(s32)
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %125
+    $vgpr0 = COPY %126
+    $sgpr30_sgpr31 = SI_CALL %124, @_Z10atomic_incPU3AS3Vj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %129:vgpr_32 = COPY $vgpr0
+    %131:vgpr_32 = V_LSHLREV_B32_e64 %82, %129, implicit $exec
+    DS_WRITE_B32_gfx9 killed %131, %35, 0, 0, implicit $exec :: (store (s32), addrspace 3)
+
+  bb.13:
+    successors: %bb.1(0x04000000), %bb.11(0x7c000000)
+
+    %38:vgpr_32 = PHI %34, %bb.11, %37, %bb.12
+    SI_END_CF %36, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %39:vgpr_32 = V_ADD_U32_e64 %35, %104, 0, implicit $exec
+    %40:vgpr_32 = V_ADD_U32_e64 %33, %104, 0, implicit $exec
+    %133:sreg_32 = V_CMP_GE_U32_e64 %40, %96, implicit $exec
+    %41:sreg_32 = SI_IF_BREAK killed %133, %32, implicit-def dead $scc
+    SI_LOOP %41, %bb.11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.14:
+    SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %138:sreg_64 = S_MOV_B64 40
+    %170:sreg_32 = COPY %46.sub0(p4)
+    %171:sreg_32 = COPY %46.sub1(p4)
+    %172:sreg_32 = COPY %138.sub0
+    %173:sreg_32 = COPY %138.sub1
+    %168:sreg_32 = S_ADD_U32 %170, %172, implicit-def $scc
+    %169:sreg_32 = S_ADDC_U32 %171, %173, implicit-def $scc, implicit $scc
+    %139:sreg_64 = REG_SEQUENCE %168, %subreg.sub0, %169, %subreg.sub1
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %141:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %142:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    $sgpr4_sgpr5 = COPY %45
+    %143:sreg_64 = IMPLICIT_DEF
+    $sgpr6_sgpr7 = COPY %143
+    $sgpr8_sgpr9 = COPY %139
+    $sgpr10_sgpr11 = COPY %47
+    $sgpr12 = COPY %49
+    $sgpr13 = COPY %50
+    $sgpr14 = COPY %51
+    %144:sreg_32 = IMPLICIT_DEF
+    $sgpr15 = COPY %144
+    $vgpr31 = COPY %43(s32)
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %141
+    $vgpr0 = COPY %142
+    $sgpr30_sgpr31 = SI_CALL %77, @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    S_ENDPGM 0
+
+...
+
+---
+name: sink_salu
+alignment: 1
+tracksRegLiveness: true
+machineFunctionInfo:
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+
+body: |
+  ; CHECK-LABEL: name: sink_salu
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr12, $sgpr13, $sgpr14
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY4]](p4), 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+  ; CHECK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 40
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY7]], [[COPY9]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY8]], [[COPY10]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z13get_global_idj + 4, target-flags(amdgpu-rel32-hi) @_Z13get_global_idj + 12, implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF]]
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY2]]
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY1]]
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr15 = COPY [[DEF1]]
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY6]](s32)
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY11]]
+  ; CHECK-NEXT:   $vgpr0 = COPY [[V_MOV_B32_e32_]]
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET]], @_Z13get_global_idj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0, implicit-def $vgpr1
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[SI_PC_ADD_REL_OFFSET1:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z12get_local_idj + 4, target-flags(amdgpu-rel32-hi) @_Z12get_local_idj + 12, implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF2]]
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY2]]
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY1]]
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr15 = COPY [[DEF3]]
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY6]](s32)
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY15]]
+  ; CHECK-NEXT:   $vgpr0 = COPY [[V_MOV_B32_e32_]]
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET1]], @_Z12get_local_idj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0, implicit-def $vgpr1
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY16]], %subreg.sub0, [[COPY17]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0
+  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[COPY18]], killed [[S_MOV_B32_]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 15364
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_]], killed [[S_MOV_B32_1]], 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], 15360, 0, implicit $exec :: (store (s32), align 1024, addrspace 3)
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[SI_PC_ADD_REL_OFFSET2:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z7barrierj + 4, target-flags(amdgpu-rel32-hi) @_Z7barrierj + 12, implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF4]]
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY2]]
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY1]]
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr15 = COPY [[DEF5]]
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY6]](s32)
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]]
+  ; CHECK-NEXT:   $vgpr0 = COPY [[V_MOV_B32_e32_1]]
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = SI_CALL [[SI_PC_ADD_REL_OFFSET2]], @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; CHECK-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_2]], [[COPY14]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 28
+  ; CHECK-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_LSHLREV_B32_e64_]], killed [[S_MOV_B32_3]], implicit $exec
+  ; CHECK-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY14]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483644
+  ; CHECK-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_LSHRREV_B32_e64_]], killed [[S_MOV_B32_4]], implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_AND_B32_e64_1]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; CHECK-NEXT:   [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 killed [[GLOBAL_LOAD_DWORD_SADDR]], killed [[V_AND_B32_e64_]], 4, implicit $exec
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[SI_PC_ADD_REL_OFFSET3:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z3minjj + 4, target-flags(amdgpu-rel32-hi) @_Z3minjj + 12, implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12, implicit $exec
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF6]]
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY2]]
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY1]]
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr15 = COPY [[DEF7]]
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY6]](s32)
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]]
+  ; CHECK-NEXT:   $vgpr0 = COPY [[V_BFE_U32_e64_]]
+  ; CHECK-NEXT:   $vgpr1 = COPY [[V_MOV_B32_e32_2]]
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET3]], @_Z3minjj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit-def $vgpr0
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[COPY21:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY21]], killed [[S_MOV_B32_5]], 0, implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]]
+  ; CHECK-NEXT:   DS_WRITE_B8_gfx9 [[V_MUL_LO_U32_e64_]], killed [[COPY22]], 15364, 0, implicit $exec :: (store (s8), addrspace 3)
+  ; CHECK-NEXT:   [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 10
+  ; CHECK-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_7]], [[COPY14]], implicit $exec
+  ; CHECK-NEXT:   [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 5
+  ; CHECK-NEXT:   [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; CHECK-NEXT:   [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 4
+  ; CHECK-NEXT:   [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+  ; CHECK-NEXT:   [[SI_PC_ADD_REL_OFFSET4:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z10atomic_incPU3AS3Vj + 4, target-flags(amdgpu-rel32-hi) @_Z10atomic_incPU3AS3Vj + 12, implicit-def dead $scc
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 15360, implicit $exec
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 59
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.10(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI %74, %bb.13
+  ; CHECK-NEXT:   SI_END_CF %75, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.10
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.14(0x04000000), %bb.3(0x7c000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_CMP_GE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 %77, [[V_ADD_U32_e64_1]], implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_U32_e64 %79, [[S_MOV_B32_12]], implicit $exec
+  ; CHECK-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[V_CMP_GE_U32_e64_]], killed [[V_CMP_GT_U32_e64_]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[S_OR_B32_]], %82, implicit-def dead $scc
+  ; CHECK-NEXT:   SI_LOOP [[SI_IF_BREAK]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.14
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.8(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_6]], %bb.0, [[SI_IF_BREAK]], %bb.2
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_6]], %bb.0, %77, %bb.2
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY23]], %bb.0, %79, %bb.2
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[PHI2]], 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 killed [[V_ADD_U32_e64_2]], 0, 0, implicit $exec :: (load (s8), addrspace 3)
+  ; CHECK-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[PHI2]], [[S_MOV_B32_8]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = nuw S_ADD_I32 [[PHI2]], [[S_MOV_B32_9]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 killed [[S_LSHL_B32_]], [[V_LSHLREV_B32_e64_1]], [[S_ADD_I32_]], implicit $exec
+  ; CHECK-NEXT:   [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], [[S_MOV_B32_8]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_U32_e64 killed [[S_ADD_I32_1]], [[COPY21]], implicit $exec
+  ; CHECK-NEXT:   [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_U32_e64_]], %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.9(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   S_BRANCH %bb.9
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:vgpr_32 = PHI %95, %bb.9
+  ; CHECK-NEXT:   SI_END_CF %96, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 %108, 1, implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_2]], 2, implicit-def dead $scc
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_OR3_B32_e64_]], [[S_ADD_I32_3]], 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.10(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_CMP_LT_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_LT_U32_e64 %98, [[COPY21]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_U32_e64_1]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.11(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[DS_READ_U8_gfx9_]], [[S_MOV_B32_11]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.11
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[COPY24]], %bb.3, [[PHI4]], %bb.5
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.3, %103, %bb.5
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:vgpr_32 = PHI [[V_OR3_B32_e64_]], %bb.3, [[V_ADD_U32_e64_3]], %bb.5
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.9:
+  ; CHECK-NEXT:   successors: %bb.5(0x04000000), %bb.9(0x7c000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_13]], %bb.4, %96, %bb.9
+  ; CHECK-NEXT:   [[PHI9:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_13]], %bb.4, %108, %bb.9
+  ; CHECK-NEXT:   [[PHI10:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.4, %103, %bb.9
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI10]], [[S_MOV_B32_9]], 0, implicit $exec
+  ; CHECK-NEXT:   [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI9]], [[S_MOV_B32_10]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], [[S_ADD_I32_4]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_5]], [[S_MOV_B32_8]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 killed [[S_ADD_I32_6]], [[COPY21]], implicit $exec
+  ; CHECK-NEXT:   [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_5]], [[S_MOV_B32_9]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[V_CMP_GE_U32_e64_1]], [[PHI8]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_7]], implicit $exec
+  ; CHECK-NEXT:   SI_LOOP [[SI_IF_BREAK1]], %bb.9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.10:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI11:%[0-9]+]]:vgpr_32 = PHI [[PHI6]], %bb.6, [[PHI]], %bb.1
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.11:
+  ; CHECK-NEXT:   successors: %bb.12(0x40000000), %bb.13(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI12:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_14]], %bb.7, %75, %bb.13
+  ; CHECK-NEXT:   [[PHI13:%[0-9]+]]:vgpr_32 = PHI [[PHI5]], %bb.7, %118, %bb.13
+  ; CHECK-NEXT:   [[PHI14:%[0-9]+]]:vgpr_32 = PHI [[PHI6]], %bb.7, %74, %bb.13
+  ; CHECK-NEXT:   [[PHI15:%[0-9]+]]:vgpr_32 = PHI [[PHI7]], %bb.7, %121, %bb.13
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[PHI13]], 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_U8_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 killed [[V_ADD_U32_e64_5]], 0, 0, implicit $exec :: (load (s8), addrspace 3)
+  ; CHECK-NEXT:   [[V_CMP_EQ_U16_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U16_e64 [[V_AND_B32_e64_2]], killed [[DS_READ_U8_gfx9_1]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U16_e64_]], %bb.13, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.12
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.12:
+  ; CHECK-NEXT:   successors: %bb.13(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI14]], [[S_MOV_B32_9]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY26:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
+  ; CHECK-NEXT:   [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
+  ; CHECK-NEXT:   [[COPY28:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
+  ; CHECK-NEXT:   [[COPY29:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY26]], [[COPY28]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_1:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY27]], [[COPY29]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_1]], %subreg.sub0, [[S_ADDC_U32_1]], %subreg.sub1
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[COPY30:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY5]]
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF8]]
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[REG_SEQUENCE3]]
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY2]]
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY1]]
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY]]
+  ; CHECK-NEXT:   $sgpr15 = COPY [[DEF9]]
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY6]](s32)
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY30]]
+  ; CHECK-NEXT:   $vgpr0 = COPY [[V_MOV_B32_e32_3]]
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = SI_CALL [[SI_PC_ADD_REL_OFFSET4]], @_Z10atomic_incPU3AS3Vj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_2]], [[COPY31]], implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 killed [[V_LSHLREV_B32_e64_2]], [[PHI15]], 0, 0, implicit $exec :: (store (s32), addrspace 3)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.13:
+  ; CHECK-NEXT:   successors: %bb.1(0x04000000), %bb.11(0x7c000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI16:%[0-9]+]]:vgpr_32 = PHI [[PHI14]], %bb.11, [[V_ADD_U32_e64_6]], %bb.12
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI15]], [[S_MOV_B32_9]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI13]], [[S_MOV_B32_9]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_GE_U32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[V_ADD_U32_e64_8]], [[COPY21]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF_BREAK2:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[V_CMP_GE_U32_e64_2]], [[PHI12]], implicit-def dead $scc
+  ; CHECK-NEXT:   SI_LOOP [[SI_IF_BREAK2]], %bb.11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.14:
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 40
+  ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
+  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
+  ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_1]].sub0
+  ; CHECK-NEXT:   [[COPY35:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_1]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_2:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY32]], [[COPY34]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_2:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY33]], [[COPY35]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_2]], %subreg.sub0, [[S_ADDC_U32_2]], %subreg.sub1
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   [[COPY36:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF10]]
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[REG_SEQUENCE4]]
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY3]]
+  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY2]]
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY1]]
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $sgpr15 = COPY [[DEF11]]
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY6]](s32)
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY36]]
+  ; CHECK-NEXT:   $vgpr0 = COPY [[V_MOV_B32_e32_4]]
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = SI_CALL [[SI_PC_ADD_REL_OFFSET2]], @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.3(0x80000000)
+    liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr12, $sgpr13, $sgpr14
+
+    %51:sgpr_32 = COPY $sgpr14
+    %50:sgpr_32 = COPY $sgpr13
+    %49:sgpr_32 = COPY $sgpr12
+    %47:sgpr_64 = COPY $sgpr8_sgpr9
+    %46:sgpr_64(p4) = COPY $sgpr6_sgpr7
+    %45:sgpr_64 = COPY $sgpr4_sgpr5
+    %43:vgpr_32(s32) = COPY $vgpr0
+    %54:sreg_64_xexec = S_LOAD_DWORDX2_IMM %46(p4), 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+    %55:sreg_64 = S_MOV_B64 40
+    %158:sreg_32 = COPY %46.sub0(p4)
+    %159:sreg_32 = COPY %46.sub1(p4)
+    %160:sreg_32 = COPY %55.sub0
+    %161:sreg_32 = COPY %55.sub1
+    %156:sreg_32 = S_ADD_U32 %158, %160, implicit-def $scc
+    %157:sreg_32 = S_ADDC_U32 %159, %161, implicit-def $scc, implicit $scc
+    %56:sreg_64 = REG_SEQUENCE %156, %subreg.sub0, %157, %subreg.sub1
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %57:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z13get_global_idj + 4, target-flags(amdgpu-rel32-hi) @_Z13get_global_idj + 12, implicit-def dead $scc
+    %58:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %59:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    $sgpr4_sgpr5 = COPY %45
+    %60:sreg_64 = IMPLICIT_DEF
+    $sgpr6_sgpr7 = COPY %60
+    $sgpr8_sgpr9 = COPY %56
+    $sgpr10_sgpr11 = COPY %47
+    $sgpr12 = COPY %49
+    $sgpr13 = COPY %50
+    $sgpr14 = COPY %51
+    %61:sreg_32 = IMPLICIT_DEF
+    $sgpr15 = COPY %61
+    $vgpr31 = COPY %43(s32)
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %58
+    $vgpr0 = COPY %59
+    $sgpr30_sgpr31 = SI_CALL killed %57, @_Z13get_global_idj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0, implicit-def $vgpr1
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %62:vgpr_32 = COPY $vgpr0
+    %63:vgpr_32 = COPY $vgpr1
+    %150:vreg_64 = REG_SEQUENCE %62, %subreg.sub0, %63, %subreg.sub1
+    %152:vgpr_32 = COPY %150.sub0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %66:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z12get_local_idj + 4, target-flags(amdgpu-rel32-hi) @_Z12get_local_idj + 12, implicit-def dead $scc
+    %67:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    $sgpr4_sgpr5 = COPY %45
+    %68:sreg_64 = IMPLICIT_DEF
+    $sgpr6_sgpr7 = COPY %68
+    $sgpr8_sgpr9 = COPY %56
+    $sgpr10_sgpr11 = COPY %47
+    $sgpr12 = COPY %49
+    $sgpr13 = COPY %50
+    $sgpr14 = COPY %51
+    %69:sreg_32 = IMPLICIT_DEF
+    $sgpr15 = COPY %69
+    $vgpr31 = COPY %43(s32)
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %67
+    $vgpr0 = COPY %59
+    $sgpr30_sgpr31 = SI_CALL killed %66, @_Z12get_local_idj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0, implicit-def $vgpr1
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %70:vgpr_32 = COPY $vgpr0
+    %71:vgpr_32 = COPY $vgpr1
+    %149:vreg_64 = REG_SEQUENCE %70, %subreg.sub0, %71, %subreg.sub1
+    %151:vgpr_32 = COPY %149.sub0
+    %74:sreg_32 = S_MOV_B32 14
+    %75:vgpr_32 = V_MUL_LO_U32_e64 killed %151, killed %74, implicit $exec
+    %76:sreg_32 = S_MOV_B32 15364
+    %4:vgpr_32 = V_ADD_U32_e64 %75, killed %76, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %59, %59, 15360, 0, implicit $exec :: (store (s32), align 1024, addrspace 3)
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %77:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z7barrierj + 4, target-flags(amdgpu-rel32-hi) @_Z7barrierj + 12, implicit-def dead $scc
+    %78:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %79:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    $sgpr4_sgpr5 = COPY %45
+    %80:sreg_64 = IMPLICIT_DEF
+    $sgpr6_sgpr7 = COPY %80
+    $sgpr8_sgpr9 = COPY %56
+    $sgpr10_sgpr11 = COPY %47
+    $sgpr12 = COPY %49
+    $sgpr13 = COPY %50
+    $sgpr14 = COPY %51
+    %81:sreg_32 = IMPLICIT_DEF
+    $sgpr15 = COPY %81
+    $vgpr31 = COPY %43(s32)
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %78
+    $vgpr0 = COPY %79
+    $sgpr30_sgpr31 = SI_CALL %77, @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %82:sreg_32 = S_MOV_B32 2
+    %83:vgpr_32 = V_LSHLREV_B32_e64 %82, %152, implicit $exec
+    %84:sreg_32 = S_MOV_B32 28
+    %85:vgpr_32 = V_AND_B32_e64 killed %83, killed %84, implicit $exec
+    %86:vgpr_32 = V_LSHRREV_B32_e64 %79, %152, implicit $exec
+    %87:sreg_32 = S_MOV_B32 2147483644
+    %88:vgpr_32 = V_AND_B32_e64 killed %86, killed %87, implicit $exec
+    %89:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed %54, killed %88, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %90:vgpr_32 = V_BFE_U32_e64 killed %89, killed %85, 4, implicit $exec
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %91:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z3minjj + 4, target-flags(amdgpu-rel32-hi) @_Z3minjj + 12, implicit-def dead $scc
+    %92:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %93:vgpr_32 = V_MOV_B32_e32 12, implicit $exec
+    $sgpr4_sgpr5 = COPY %45
+    %94:sreg_64 = IMPLICIT_DEF
+    $sgpr6_sgpr7 = COPY %94
+    $sgpr8_sgpr9 = COPY %56
+    $sgpr10_sgpr11 = COPY %47
+    $sgpr12 = COPY %49
+    $sgpr13 = COPY %50
+    $sgpr14 = COPY %51
+    %95:sreg_32 = IMPLICIT_DEF
+    $sgpr15 = COPY %95
+    $vgpr31 = COPY %43(s32)
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %92
+    $vgpr0 = COPY %90
+    $vgpr1 = COPY %93
+    $sgpr30_sgpr31 = SI_CALL killed %91, @_Z3minjj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit-def $vgpr0
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %96:vgpr_32 = COPY $vgpr0
+    %97:sreg_32 = S_MOV_B32 -1
+    %2:vgpr_32 = V_ADD_U32_e64 %96, killed %97, 0, implicit $exec
+    %98:sreg_32 = S_MOV_B32 0
+    %99:vgpr_32 = COPY %98
+    DS_WRITE_B8_gfx9 %75, killed %99, 15364, 0, implicit $exec :: (store (s8), addrspace 3)
+    %100:sreg_32 = S_MOV_B32 10
+    %3:vgpr_32 = V_LSHLREV_B32_e64 killed %100, %152, implicit $exec
+    %153:vgpr_32 = COPY %98, implicit $exec
+    %102:sreg_32 = S_MOV_B32 5
+    %104:sreg_32 = S_MOV_B32 1
+    %109:sreg_32 = S_MOV_B32 4
+    %118:sreg_32 = S_MOV_B32 255
+    %124:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z10atomic_incPU3AS3Vj + 4, target-flags(amdgpu-rel32-hi) @_Z10atomic_incPU3AS3Vj + 12, implicit-def dead $scc
+    %126:vgpr_32 = V_MOV_B32_e32 15360, implicit $exec
+    %127:sreg_64 = IMPLICIT_DEF
+    %128:sreg_32 = IMPLICIT_DEF
+    %135:sreg_32 = S_MOV_B32 59
+    S_BRANCH %bb.3
+
+  bb.1:
+    successors: %bb.10(0x80000000)
+
+    %5:vgpr_32 = PHI %38, %bb.13
+    SI_END_CF %41, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.10
+
+  bb.2:
+    successors: %bb.14(0x04000000), %bb.3(0x7c000000)
+
+    %134:sreg_32 = V_CMP_GE_U32_e64 %12, %2, implicit $exec
+    %136:sreg_32 = V_CMP_GT_U32_e64 %31, %135, implicit $exec
+    %137:sreg_32 = S_OR_B32 killed %134, killed %136, implicit-def dead $scc
+    %7:sreg_32 = SI_IF_BREAK killed %137, %8, implicit-def dead $scc
+    SI_LOOP %7, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.14
+
+  bb.3:
+    successors: %bb.4(0x40000000), %bb.8(0x40000000)
+
+    %8:sreg_32 = PHI %98, %bb.0, %7, %bb.2
+    %9:sreg_32 = PHI %98, %bb.0, %12, %bb.2
+    %10:vgpr_32 = PHI %153, %bb.0, %31, %bb.2
+    %101:vgpr_32 = V_ADD_U32_e64 %4, %9, 0, implicit $exec
+    %11:vgpr_32 = DS_READ_U8_gfx9 killed %101, 0, 0, implicit $exec :: (load (s8), addrspace 3)
+    %103:sreg_32 = S_LSHL_B32 %9, %102, implicit-def dead $scc
+    %12:sreg_32 = nuw S_ADD_I32 %9, %104, implicit-def dead $scc
+    %13:vgpr_32 = V_OR3_B32_e64 killed %103, %3, %12, implicit $exec
+    %105:sreg_32 = S_ADD_I32 %9, %102, implicit-def dead $scc
+    %106:sreg_32 = V_CMP_LT_U32_e64 killed %105, %96, implicit $exec
+    %155:vgpr_32 = COPY %12, implicit $exec
+    %14:sreg_32 = SI_IF killed %106, %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    successors: %bb.9(0x80000000)
+
+    %107:sreg_32 = S_MOV_B32 0
+    S_BRANCH %bb.9
+
+  bb.5:
+    successors: %bb.8(0x80000000)
+
+    %17:vgpr_32 = PHI %154, %bb.9
+    SI_END_CF %30, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.8
+
+  bb.6:
+    successors: %bb.7(0x40000000), %bb.10(0x40000000)
+
+    %114:sreg_32 = V_CMP_LT_U32_e64 %20, %96, implicit $exec
+    %19:sreg_32 = SI_IF killed %114, %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.7
+
+  bb.7:
+    successors: %bb.11(0x80000000)
+
+    %115:sreg_32 = S_MOV_B32 0
+    %119:vgpr_32 = V_AND_B32_e64 %11, %118, implicit $exec
+    S_BRANCH %bb.11
+
+  bb.8:
+    successors: %bb.6(0x80000000)
+
+    %20:vgpr_32 = PHI %155, %bb.3, %17, %bb.5
+    %21:vgpr_32 = PHI %10, %bb.3, %26, %bb.5
+    %22:vgpr_32 = PHI %13, %bb.3, %28, %bb.5
+    SI_END_CF %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.6
+
+  bb.9:
+    successors: %bb.5(0x04000000), %bb.9(0x7c000000)
+
+    %23:sreg_32 = PHI %107, %bb.4, %30, %bb.9
+    %24:sreg_32 = PHI %107, %bb.4, %27, %bb.9
+    %25:vgpr_32 = PHI %10, %bb.4, %26, %bb.9
+    %26:vgpr_32 = V_ADD_U32_e64 %25, %104, 0, implicit $exec
+    %27:sreg_32 = S_ADD_I32 %24, %109, implicit-def dead $scc
+    %110:sreg_32 = S_ADD_I32 %9, %27, implicit-def dead $scc
+    %112:sreg_32 = S_ADD_I32 %110, %102, implicit-def dead $scc
+    %113:sreg_32 = V_CMP_GE_U32_e64 killed %112, %96, implicit $exec
+    %1000:sreg_32 = S_ADD_I32 %27, 1, implicit-def dead $scc
+    %1001:sreg_32 = S_ADD_I32 %1000, 2, implicit-def dead $scc
+    %28:vgpr_32 = V_ADD_U32_e64 %13, %1001, 0, implicit $exec
+    %29:sreg_32 = S_ADD_I32 %110, %104, implicit-def dead $scc
+    %30:sreg_32 = SI_IF_BREAK killed %113, %23, implicit-def dead $scc
+    %154:vgpr_32 = COPY %29, implicit $exec
+    SI_LOOP %30, %bb.9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.10:
+    successors: %bb.2(0x80000000)
+
+    %31:vgpr_32 = PHI %21, %bb.6, %5, %bb.1
+    SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.11:
+    successors: %bb.12(0x40000000), %bb.13(0x40000000)
+
+    %32:sreg_32 = PHI %115, %bb.7, %41, %bb.13
+    %33:vgpr_32 = PHI %20, %bb.7, %40, %bb.13
+    %34:vgpr_32 = PHI %21, %bb.7, %38, %bb.13
+    %35:vgpr_32 = PHI %22, %bb.7, %39, %bb.13
+    %116:vgpr_32 = V_ADD_U32_e64 %4, %33, 0, implicit $exec
+    %117:vgpr_32 = DS_READ_U8_gfx9 killed %116, 0, 0, implicit $exec :: (load (s8), addrspace 3)
+    %120:sreg_32 = V_CMP_EQ_U16_e64 %119, killed %117, implicit $exec
+    %36:sreg_32 = SI_IF killed %120, %bb.13, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.12
+
+  bb.12:
+    successors: %bb.13(0x80000000)
+
+    %37:vgpr_32 = V_ADD_U32_e64 %34, %104, 0, implicit $exec
+    %164:sreg_32 = COPY %46.sub0(p4)
+    %165:sreg_32 = COPY %46.sub1(p4)
+    %166:sreg_32 = COPY %55.sub0
+    %167:sreg_32 = COPY %55.sub1
+    %162:sreg_32 = S_ADD_U32 %164, %166, implicit-def $scc
+    %163:sreg_32 = S_ADDC_U32 %165, %167, implicit-def $scc, implicit $scc
+    %123:sreg_64 = REG_SEQUENCE %162, %subreg.sub0, %163, %subreg.sub1
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %125:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    $sgpr4_sgpr5 = COPY %45
+    $sgpr6_sgpr7 = COPY %127
+    $sgpr8_sgpr9 = COPY %123
+    $sgpr10_sgpr11 = COPY %47
+    $sgpr12 = COPY %49
+    $sgpr13 = COPY %50
+    $sgpr14 = COPY %51
+    $sgpr15 = COPY %128
+    $vgpr31 = COPY %43(s32)
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %125
+    $vgpr0 = COPY %126
+    $sgpr30_sgpr31 = SI_CALL %124, @_Z10atomic_incPU3AS3Vj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %129:vgpr_32 = COPY $vgpr0
+    %131:vgpr_32 = V_LSHLREV_B32_e64 %82, %129, implicit $exec
+    DS_WRITE_B32_gfx9 killed %131, %35, 0, 0, implicit $exec :: (store (s32), addrspace 3)
+
+  bb.13:
+    successors: %bb.1(0x04000000), %bb.11(0x7c000000)
+
+    %38:vgpr_32 = PHI %34, %bb.11, %37, %bb.12
+    SI_END_CF %36, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %39:vgpr_32 = V_ADD_U32_e64 %35, %104, 0, implicit $exec
+    %40:vgpr_32 = V_ADD_U32_e64 %33, %104, 0, implicit $exec
+    %133:sreg_32 = V_CMP_GE_U32_e64 %40, %96, implicit $exec
+    %41:sreg_32 = SI_IF_BREAK killed %133, %32, implicit-def dead $scc
+    SI_LOOP %41, %bb.11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.14:
+    SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %138:sreg_64 = S_MOV_B64 40
+    %170:sreg_32 = COPY %46.sub0(p4)
+    %171:sreg_32 = COPY %46.sub1(p4)
+    %172:sreg_32 = COPY %138.sub0
+    %173:sreg_32 = COPY %138.sub1
+    %168:sreg_32 = S_ADD_U32 %170, %172, implicit-def $scc
+    %169:sreg_32 = S_ADDC_U32 %171, %173, implicit-def $scc, implicit $scc
+    %139:sreg_64 = REG_SEQUENCE %168, %subreg.sub0, %169, %subreg.sub1
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    %141:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %142:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    $sgpr4_sgpr5 = COPY %45
+    %143:sreg_64 = IMPLICIT_DEF
+    $sgpr6_sgpr7 = COPY %143
+    $sgpr8_sgpr9 = COPY %139
+    $sgpr10_sgpr11 = COPY %47
+    $sgpr12 = COPY %49
+    $sgpr13 = COPY %50
+    $sgpr14 = COPY %51
+    %144:sreg_32 = IMPLICIT_DEF
+    $sgpr15 = COPY %144
+    $vgpr31 = COPY %43(s32)
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %141
+    $vgpr0 = COPY %142
+    $sgpr30_sgpr31 = SI_CALL %77, @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    S_ENDPGM 0
+
+...

>From b615a6b6666670c78e4369172fb53b1f005184e6 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Tue, 26 Sep 2023 18:23:52 +0200
Subject: [PATCH 3/3] AMDGPU: Fix temporal divergence introduced by
 machine-sink

Temporal divergence that was present in the input or introduced by IR
transforms, like code-sinking or LICM, is handled in SIFixSGPRCopies
by changing the SGPR source instruction into a VGPR instruction.
After 5b657f50b8e8dc5836fb80e566ca7569fd04c26f, which moved LICM after
AMDGPUCodeGenPrepare, machine-sinking can also introduce temporal
divergence by sinking instructions out of a cycle.
Add a callback in TargetInstrInfo so that targets can fix temporal
divergence introduced by sinking an instruction out of a cycle with a
divergent exit. Use machine uniformity analysis to detect temporally
divergent uses. Also cover the theoretical case where a sunk SALU
instruction has to be turned into a VALU instruction.
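
For illustration only, this is how a target opts in to the new hook; the
target class name below is made up, and the real override is the
SIInstrInfo.cpp change in this patch. MachineSinking invokes the callback
once, after all sinking is done, so the target can repair any temporal
divergence the pass introduced:

    // Hypothetical target sketch, not part of this patch.
    class MyTargetInstrInfo : public TargetInstrInfo {
    public:
      void fixTemporalDivergence(MachineFunction &MF, MachineDominatorTree *DT,
                                 MachineCycleInfo *CI) const override {
        // Recompute the analyses invalidated by sinking, then repair uses
        // that escape cycles with divergent exits (see SIInstrInfo.cpp).
      }
    };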
---
 llvm/include/llvm/ADT/GenericUniformityImpl.h |  43 +++++++
 llvm/include/llvm/ADT/GenericUniformityInfo.h |   6 +
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   |   8 ++
 llvm/lib/Analysis/UniformityAnalysis.cpp      |   9 +-
 llvm/lib/CodeGen/MachineSink.cpp              |   2 +
 .../lib/CodeGen/MachineUniformityAnalysis.cpp |   9 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  76 ++++++++++++
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |   3 +
 ...ne-sink-temporal-divergence-swdev407790.ll |   6 +-
 ...e-sink-temporal-divergence-swdev407790.mir | 114 +++++++++---------
 10 files changed, 214 insertions(+), 62 deletions(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index ddd0746ccd91632..755e1161b41b521 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -341,6 +341,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   using DivergenceDescriptorT =
       typename SyncDependenceAnalysisT::DivergenceDescriptor;
   using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap;
+  using UseOutsideCycleWithDivergentExitInfoT =
+      typename std::tuple<ConstValueRefT, const InstructionT *,
+                          SmallVector<BlockT *, 4>>;
 
   GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI,
                                 const TargetTransformInfo *TTI)
@@ -396,6 +399,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
 
   void print(raw_ostream &out) const;
 
+  iterator_range<const UseOutsideCycleWithDivergentExitInfoT *>
+  uses_outside_cycles_with_divergent_exit() const;
+
 protected:
   /// \brief Value/block pair representing a single phi input.
   struct PhiInput {
@@ -427,6 +433,8 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
 
   // Recognized cycles with divergent exits.
   SmallPtrSet<const CycleT *, 16> DivergentExitCycles;
+  SmallVector<UseOutsideCycleWithDivergentExitInfoT, 4>
+      UsesOutsideCyclesWithDivergentExit;
 
   // Cycles assumed to be divergent.
   //
@@ -470,6 +478,10 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   /// \brief Whether \p Def is divergent when read in \p ObservingBlock.
   bool isTemporalDivergent(const BlockT &ObservingBlock,
                            const InstructionT &Def) const;
+
+  void recordUseOutsideCycleWithDivergentExit(ConstValueRefT Src,
+                                              const InstructionT *UserInstr,
+                                              const CycleT &DefCycle);
 };
 
 template <typename ImplT>
@@ -1210,6 +1222,20 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
   }
 }
 
+template <typename ContextT>
+using UseOutsideCycleWithDivergentExitInfoT =
+    typename std::tuple<typename ContextT::ConstValueRefT,
+                        const typename ContextT::InstructionT *,
+                        SmallVector<typename ContextT::BlockT *, 4>>;
+
+template <typename ContextT>
+iterator_range<const UseOutsideCycleWithDivergentExitInfoT<ContextT> *>
+GenericUniformityAnalysisImpl<
+    ContextT>::uses_outside_cycles_with_divergent_exit() const {
+  return make_range(UsesOutsideCyclesWithDivergentExit.begin(),
+                    UsesOutsideCyclesWithDivergentExit.end());
+}
+
 template <typename ContextT>
 bool GenericUniformityInfo<ContextT>::hasDivergence() const {
   return DA->hasDivergence();
@@ -1248,6 +1274,13 @@ void GenericUniformityInfo<ContextT>::print(raw_ostream &out) const {
   DA->print(out);
 }
 
+template <typename ContextT>
+iterator_range<const UseOutsideCycleWithDivergentExitInfoT<ContextT> *>
+GenericUniformityInfo<ContextT>::uses_outside_cycles_with_divergent_exit()
+    const {
+  return DA->uses_outside_cycles_with_divergent_exit();
+}
+
 template <typename ContextT>
 void llvm::ModifiedPostOrder<ContextT>::computeStackPO(
     SmallVectorImpl<const BlockT *> &Stack, const CycleInfoT &CI,
@@ -1367,6 +1400,16 @@ void llvm::ModifiedPostOrder<ContextT>::compute(const CycleInfoT &CI) {
   computeStackPO(Stack, CI, nullptr, Finalized);
 }
 
+template <typename ContextT>
+void GenericUniformityAnalysisImpl<ContextT>::
+    recordUseOutsideCycleWithDivergentExit(ConstValueRefT Src,
+                                           const InstructionT *UserInstr,
+                                           const CycleT &DefCycle) {
+  SmallVector<BlockT *, 4> TmpExitBlocks;
+  DefCycle.getExitBlocks(TmpExitBlocks);
+  UsesOutsideCyclesWithDivergentExit.push_back({Src, UserInstr, TmpExitBlocks});
+}
+
 } // namespace llvm
 
 #undef DEBUG_TYPE
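
As a usage sketch for the new accessor (mirroring the consumer added to
SIInstrInfo.cpp later in this patch; the free function and its parameters
here are hypothetical):

    #include "llvm/CodeGen/MachineUniformityAnalysis.h"
    using namespace llvm;

    static void walkEscapingUses(MachineFunction &MF, MachineCycleInfo &CI,
                                 MachineDominatorTree &DT) {
      MachineUniformityInfo MUI = computeMachineUniformityInfo(
          MF, CI, DT.getBase(), /*HasBranchDivergence=*/true);
      // Each recorded tuple pairs a value defined inside a cycle with a user
      // outside that cycle, plus the cycle's exit blocks; such a use is
      // temporally divergent whenever the cycle exit is divergent.
      for (auto [SrcReg, UserInstr, CycleExitBlocks] :
           MUI.uses_outside_cycles_with_divergent_exit()) {
        // Inspect or transform each escaping use here.
        (void)SrcReg; (void)UserInstr; (void)CycleExitBlocks;
      }
    }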
diff --git a/llvm/include/llvm/ADT/GenericUniformityInfo.h b/llvm/include/llvm/ADT/GenericUniformityInfo.h
index e53afccc020b469..a76813d4bb964a1 100644
--- a/llvm/include/llvm/ADT/GenericUniformityInfo.h
+++ b/llvm/include/llvm/ADT/GenericUniformityInfo.h
@@ -39,6 +39,9 @@ template <typename ContextT> class GenericUniformityInfo {
 
   using CycleInfoT = GenericCycleInfo<ContextT>;
   using CycleT = typename CycleInfoT::CycleT;
+  using UseOutsideCycleWithDivergentExitInfoT =
+      typename std::tuple<ConstValueRefT, const InstructionT *,
+                          SmallVector<BlockT *, 4>>;
 
   GenericUniformityInfo(const DominatorTreeT &DT, const CycleInfoT &CI,
                         const TargetTransformInfo *TTI = nullptr);
@@ -78,6 +81,9 @@ template <typename ContextT> class GenericUniformityInfo {
 
   void print(raw_ostream &Out) const;
 
+  iterator_range<const UseOutsideCycleWithDivergentExitInfoT *>
+  uses_outside_cycles_with_divergent_exit() const;
+
 private:
   using ImplT = GenericUniformityAnalysisImpl<ContextT>;
 
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 98679b4dcf3cbfb..2135484448ef4ad 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -19,6 +19,8 @@
 #include "llvm/ADT/Uniformity.h"
 #include "llvm/CodeGen/MIRFormatter.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -150,6 +152,12 @@ class TargetInstrInfo : public MCInstrInfo {
     return false;
   }
 
+  virtual void fixTemporalDivergence(MachineFunction &MF,
+                                     MachineDominatorTree *DT,
+                                     MachineCycleInfo *CI) const {
+    return;
+  }
+
 protected:
   /// For instructions with opcodes for which the M_REMATERIALIZABLE flag is
   /// set, this hook lets the target specify whether the instruction is actually
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 2d617db431c5888..df1299610469d30 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -80,12 +80,17 @@ template <>
 void llvm::GenericUniformityAnalysisImpl<
     SSAContext>::propagateTemporalDivergence(const Instruction &I,
                                              const Cycle &DefCycle) {
-  if (isDivergent(I))
-    return;
   for (auto *User : I.users()) {
     auto *UserInstr = cast<Instruction>(User);
     if (DefCycle.contains(UserInstr->getParent()))
       continue;
+
+    recordUseOutsideCycleWithDivergentExit(cast<Value>(&I), UserInstr,
+                                           DefCycle);
+
+    if (isDivergent(I))
+      continue;
+
     markDivergent(*UserInstr);
   }
 }
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 02c7880f86f00a1..488354ceb101bc1 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -779,6 +779,8 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
     MRI->clearKillFlags(I);
   RegsToClearKillFlags.clear();
 
+  TII->fixTemporalDivergence(MF, DT, CI);
+
   return EverMadeChange;
 }
 
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 3e0fe2b1ba087fe..5bd87d069285857 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -117,11 +117,16 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::
     if (!Op.getReg().isVirtual())
       continue;
     auto Reg = Op.getReg();
-    if (isDivergent(Reg))
-      continue;
+
     for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
       if (DefCycle.contains(UserInstr.getParent()))
         continue;
+
+      recordUseOutsideCycleWithDivergentExit(Reg, &UserInstr, DefCycle);
+
+      if (isDivergent(Reg))
+        continue;
+
       markDivergent(UserInstr);
     }
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2799a3e78b04d22..d372ae4602aa788 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/MachineUniformityAnalysis.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/IR/DiagnosticInfo.h"
@@ -171,6 +172,81 @@ bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
          isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
 }
 
+// Get pointers for building an instruction after MI (skips PHIs if needed).
+static std::pair<MachineBasicBlock *, MachineBasicBlock::iterator>
+getInsertAfterPtrs(MachineInstr *MI) {
+  MachineBasicBlock *InsertMBB = MI->getParent();
+  return std::make_pair(
+      InsertMBB, InsertMBB->SkipPHIsAndLabels(std::next(MI->getIterator())));
+}
+
+static void replaceUseRegisterWith(const MachineInstr *MI, Register Reg,
+                                   Register Newreg) {
+  for (unsigned i = 0; i < MI->getNumOperands(); ++i) {
+    const MachineOperand &Op = MI->getOperand(i);
+    if (Op.isReg() && Op.getReg() == Reg) {
+      const_cast<MachineInstr *>(MI)->getOperand(i).setReg(Newreg);
+    }
+  }
+}
+
+void SIInstrInfo::fixTemporalDivergence(MachineFunction &MF,
+                                        MachineDominatorTree *DT,
+                                        MachineCycleInfo *CI) const {
+  DT->calculate(MF);
+  CI->clear();
+  CI->compute(MF);
+  MachineUniformityInfo MUI =
+      computeMachineUniformityInfo(MF, *CI, DT->getBase(), true);
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
+
+  // Temporal divergence lowering is required for a uniform UniformSourceReg
+  // and a divergent UserInstr; UserInstr is only uniform if the cycle has a uniform exit.
+  for (auto [SrcReg, UserInstr, CycleExitBlocks] :
+       MUI.uses_outside_cycles_with_divergent_exit()) {
+
+    MachineInstr *UniformSourceInstr = MRI.getVRegDef(SrcReg);
+    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+    // FIXME: Use uniformity info instead of register classes and opcode checks.
+    if (!RI.isSGPRClass(SrcRC))
+      continue;
+
+    if (UniformSourceInstr->getOpcode() == AMDGPU::SI_IF_BREAK ||
+        UserInstr->getOpcode() == AMDGPU::SI_IF ||
+        UserInstr->getOpcode() == AMDGPU::SI_IF_BREAK ||
+        UserInstr->getOpcode() == AMDGPU::COPY)
+      continue;
+
+    // We could move UniformSourceInstr to the VALU, but that tends to pull
+    // many instructions into VGPRs; this way the loop stays in SGPRs and we
+    // do not increase VGPR register pressure.
+    unsigned Size = TRI.getRegSizeInBits(*MRI.getRegClass(SrcReg));
+    Register VgprDst =
+        MRI.createVirtualRegister(TRI.getVGPRClassForBitWidth(Size));
+
+    auto [MBB, AfterUniformSourceReg] = getInsertAfterPtrs(UniformSourceInstr);
+    BuildMI(*MBB, AfterUniformSourceReg, {}, TII.get(AMDGPU::COPY))
+        .addDef(VgprDst)
+        .addReg(SrcReg)
+        .addReg(AMDGPU::EXEC, RegState::Implicit);
+
+    replaceUseRegisterWith(UserInstr, SrcReg, VgprDst);
+
+    // If an SGPR instruction was sunk, move it to the VALU.
+    if (RI.isSGPRClass(MRI.getRegClass(UserInstr->getOperand(0).getReg()))) {
+      SIInstrWorklist Worklist;
+      Worklist.insert(const_cast<MachineInstr *>(UserInstr));
+      moveToVALU(Worklist, DT);
+    }
+  }
+
+  return;
+}
+
 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                           int64_t &Offset0,
                                           int64_t &Offset1) const {
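
Condensed, the repair above keeps the uniform computation in SGPRs inside
the cycle and hands the escaping use a per-lane VGPR copy made while the
cycle's EXEC mask is still live. A simplified restatement (names as in the
patch; not a drop-in replacement for the hunk):

    // Inside the loop over uses_outside_cycles_with_divergent_exit():
    Register VgprDst =
        MRI.createVirtualRegister(TRI.getVGPRClassForBitWidth(Size));

    // 1. Right after the uniform SGPR def, still inside the cycle, copy the
    //    value into a VGPR; the implicit EXEC use ties it to the active lanes.
    auto [MBB, InsertPt] = getInsertAfterPtrs(UniformSourceInstr);
    BuildMI(*MBB, InsertPt, {}, TII.get(AMDGPU::COPY))
        .addDef(VgprDst)
        .addReg(SrcReg)
        .addReg(AMDGPU::EXEC, RegState::Implicit);

    // 2. Rewrite the use outside the cycle to read the VGPR copy instead.
    replaceUseRegisterWith(UserInstr, SrcReg, VgprDst);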
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index a4f59fc3513d646..6a54653314b8131 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -222,6 +222,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 
   bool isIgnorableUse(const MachineOperand &MO) const override;
 
+  void fixTemporalDivergence(MachineFunction &MF, MachineDominatorTree *DT,
+                             MachineCycleInfo *CI) const override;
+
   bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0,
                                int64_t &Offset1) const override;
 
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index ca1cf526d949a14..0c631bdcdd374a8 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -167,6 +167,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s59
 ; CHECK-NEXT:    s_add_i32 s58, s58, 4
 ; CHECK-NEXT:    s_add_i32 s4, s55, s58
+; CHECK-NEXT:    v_mov_b32_e32 v0, s58
 ; CHECK-NEXT:    s_add_i32 s5, s4, 5
 ; CHECK-NEXT:    s_add_i32 s4, s4, 1
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s5, v42
@@ -267,7 +268,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:  .LBB0_16: ; %Flow43
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
-; CHECK-NEXT:    v_add_nc_u32_e32 v57, s58, v57
+; CHECK-NEXT:    v_add_nc_u32_e32 v57, v57, v0
 ; CHECK-NEXT:  .LBB0_17: ; %Flow44
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s56
@@ -869,6 +870,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    s_add_i32 s7, s7, 4
 ; CHECK-NEXT:    v_add_nc_u32_e32 v43, 1, v43
 ; CHECK-NEXT:    s_add_i32 s8, s4, s7
+; CHECK-NEXT:    v_mov_b32_e32 v0, s7
 ; CHECK-NEXT:    s_add_i32 s9, s8, 5
 ; CHECK-NEXT:    s_add_i32 s8, s8, 1
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s9, v41
@@ -879,7 +881,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:  ; %bb.4: ; %Flow3
 ; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
-; CHECK-NEXT:    v_add_nc_u32_e32 v47, s7, v47
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, v47, v0
 ; CHECK-NEXT:  .LBB1_5: ; %Flow4
 ; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir
index 191b400011b6b2b..691f08840890755 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir
@@ -212,7 +212,7 @@ body: |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:vgpr_32 = PHI %95, %bb.9
   ; CHECK-NEXT:   SI_END_CF %96, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_OR3_B32_e64_]], %108, 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_OR3_B32_e64_]], %148, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
@@ -246,12 +246,13 @@ body: |
   ; CHECK-NEXT:   [[PHI10:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.4, %103, %bb.9
   ; CHECK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI10]], [[S_MOV_B32_9]], 0, implicit $exec
   ; CHECK-NEXT:   [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI9]], [[S_MOV_B32_10]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_2]], implicit $exec
   ; CHECK-NEXT:   [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], [[S_ADD_I32_2]], implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_3]], [[S_MOV_B32_8]], implicit-def dead $scc
   ; CHECK-NEXT:   [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 killed [[S_ADD_I32_4]], [[COPY21]], implicit $exec
   ; CHECK-NEXT:   [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_3]], [[S_MOV_B32_9]], implicit-def dead $scc
   ; CHECK-NEXT:   [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[V_CMP_GE_U32_e64_1]], [[PHI8]], implicit-def dead $scc
-  ; CHECK-NEXT:   [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_5]], implicit $exec
+  ; CHECK-NEXT:   [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_5]], implicit $exec
   ; CHECK-NEXT:   SI_LOOP [[SI_IF_BREAK1]], %bb.9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
@@ -279,15 +280,15 @@ body: |
   ; CHECK-NEXT:   successors: %bb.13(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI14]], [[S_MOV_B32_9]], 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY26:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
-  ; CHECK-NEXT:   [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
-  ; CHECK-NEXT:   [[COPY28:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
-  ; CHECK-NEXT:   [[COPY29:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
-  ; CHECK-NEXT:   [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY26]], [[COPY28]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADDC_U32_1:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY27]], [[COPY29]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
+  ; CHECK-NEXT:   [[COPY28:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
+  ; CHECK-NEXT:   [[COPY29:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
+  ; CHECK-NEXT:   [[COPY30:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY27]], [[COPY29]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_1:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY28]], [[COPY30]], implicit-def $scc, implicit $scc
   ; CHECK-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_1]], %subreg.sub0, [[S_ADDC_U32_1]], %subreg.sub1
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   [[COPY30:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY5]]
   ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF8]]
   ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[REG_SEQUENCE3]]
@@ -297,12 +298,12 @@ body: |
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY]]
   ; CHECK-NEXT:   $sgpr15 = COPY [[DEF9]]
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY6]](s32)
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY30]]
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY31]]
   ; CHECK-NEXT:   $vgpr0 = COPY [[V_MOV_B32_e32_3]]
   ; CHECK-NEXT:   $sgpr30_sgpr31 = SI_CALL [[SI_PC_ADD_REL_OFFSET4]], @_Z10atomic_incPU3AS3Vj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK-NEXT:   [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_2]], [[COPY31]], implicit $exec
+  ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_2]], [[COPY32]], implicit $exec
   ; CHECK-NEXT:   DS_WRITE_B32_gfx9 killed [[V_LSHLREV_B32_e64_2]], [[PHI15]], 0, 0, implicit $exec :: (store (s32), addrspace 3)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.13:
@@ -320,15 +321,15 @@ body: |
   ; CHECK-NEXT: bb.14:
   ; CHECK-NEXT:   SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 40
-  ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
-  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
-  ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_1]].sub0
-  ; CHECK-NEXT:   [[COPY35:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_1]].sub1
-  ; CHECK-NEXT:   [[S_ADD_U32_2:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY32]], [[COPY34]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADDC_U32_2:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY33]], [[COPY35]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
+  ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
+  ; CHECK-NEXT:   [[COPY35:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_1]].sub0
+  ; CHECK-NEXT:   [[COPY36:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_1]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_2:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY33]], [[COPY35]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_2:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY34]], [[COPY36]], implicit-def $scc, implicit $scc
   ; CHECK-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_2]], %subreg.sub0, [[S_ADDC_U32_2]], %subreg.sub1
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   [[COPY36:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   [[COPY37:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY5]]
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
@@ -341,7 +342,7 @@ body: |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   $sgpr15 = COPY [[DEF11]]
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY6]](s32)
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY36]]
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY37]]
   ; CHECK-NEXT:   $vgpr0 = COPY [[V_MOV_B32_e32_4]]
   ; CHECK-NEXT:   $sgpr30_sgpr31 = SI_CALL [[SI_PC_ADD_REL_OFFSET2]], @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
@@ -860,9 +861,9 @@ body: |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:vgpr_32 = PHI %95, %bb.9
   ; CHECK-NEXT:   SI_END_CF %96, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 %108, 1, implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_2]], 2, implicit-def dead $scc
-  ; CHECK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_OR3_B32_e64_]], [[S_ADD_I32_3]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %150, 1, 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_3]], 2, 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_OR3_B32_e64_]], [[V_ADD_U32_e64_4]], 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
@@ -884,7 +885,7 @@ body: |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[COPY24]], %bb.3, [[PHI4]], %bb.5
   ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.3, %103, %bb.5
-  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:vgpr_32 = PHI [[V_OR3_B32_e64_]], %bb.3, [[V_ADD_U32_e64_3]], %bb.5
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:vgpr_32 = PHI [[V_OR3_B32_e64_]], %bb.3, [[V_ADD_U32_e64_5]], %bb.5
   ; CHECK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.6
   ; CHECK-NEXT: {{  $}}
@@ -894,14 +895,15 @@ body: |
   ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_13]], %bb.4, %96, %bb.9
   ; CHECK-NEXT:   [[PHI9:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_13]], %bb.4, %108, %bb.9
   ; CHECK-NEXT:   [[PHI10:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.4, %103, %bb.9
-  ; CHECK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI10]], [[S_MOV_B32_9]], 0, implicit $exec
-  ; CHECK-NEXT:   [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI9]], [[S_MOV_B32_10]], implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], [[S_ADD_I32_4]], implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_5]], [[S_MOV_B32_8]], implicit-def dead $scc
-  ; CHECK-NEXT:   [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 killed [[S_ADD_I32_6]], [[COPY21]], implicit $exec
-  ; CHECK-NEXT:   [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_5]], [[S_MOV_B32_9]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI10]], [[S_MOV_B32_9]], 0, implicit $exec
+  ; CHECK-NEXT:   [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI9]], [[S_MOV_B32_10]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_2]], implicit $exec
+  ; CHECK-NEXT:   [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], [[S_ADD_I32_2]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_3]], [[S_MOV_B32_8]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 killed [[S_ADD_I32_4]], [[COPY21]], implicit $exec
+  ; CHECK-NEXT:   [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_3]], [[S_MOV_B32_9]], implicit-def dead $scc
   ; CHECK-NEXT:   [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[V_CMP_GE_U32_e64_1]], [[PHI8]], implicit-def dead $scc
-  ; CHECK-NEXT:   [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_7]], implicit $exec
+  ; CHECK-NEXT:   [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_5]], implicit $exec
   ; CHECK-NEXT:   SI_LOOP [[SI_IF_BREAK1]], %bb.9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
@@ -919,8 +921,8 @@ body: |
   ; CHECK-NEXT:   [[PHI13:%[0-9]+]]:vgpr_32 = PHI [[PHI5]], %bb.7, %118, %bb.13
   ; CHECK-NEXT:   [[PHI14:%[0-9]+]]:vgpr_32 = PHI [[PHI6]], %bb.7, %74, %bb.13
   ; CHECK-NEXT:   [[PHI15:%[0-9]+]]:vgpr_32 = PHI [[PHI7]], %bb.7, %121, %bb.13
-  ; CHECK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[PHI13]], 0, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_U8_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 killed [[V_ADD_U32_e64_5]], 0, 0, implicit $exec :: (load (s8), addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[PHI13]], 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_U8_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 killed [[V_ADD_U32_e64_7]], 0, 0, implicit $exec :: (load (s8), addrspace 3)
   ; CHECK-NEXT:   [[V_CMP_EQ_U16_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U16_e64 [[V_AND_B32_e64_2]], killed [[DS_READ_U8_gfx9_1]], implicit $exec
   ; CHECK-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U16_e64_]], %bb.13, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.12
@@ -928,16 +930,16 @@ body: |
   ; CHECK-NEXT: bb.12:
   ; CHECK-NEXT:   successors: %bb.13(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI14]], [[S_MOV_B32_9]], 0, implicit $exec
-  ; CHECK-NEXT:   [[COPY26:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
-  ; CHECK-NEXT:   [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
-  ; CHECK-NEXT:   [[COPY28:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
-  ; CHECK-NEXT:   [[COPY29:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
-  ; CHECK-NEXT:   [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY26]], [[COPY28]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADDC_U32_1:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY27]], [[COPY29]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI14]], [[S_MOV_B32_9]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
+  ; CHECK-NEXT:   [[COPY28:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
+  ; CHECK-NEXT:   [[COPY29:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
+  ; CHECK-NEXT:   [[COPY30:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY27]], [[COPY29]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_1:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY28]], [[COPY30]], implicit-def $scc, implicit $scc
   ; CHECK-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_1]], %subreg.sub0, [[S_ADDC_U32_1]], %subreg.sub1
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   [[COPY30:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY5]]
   ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[DEF8]]
   ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[REG_SEQUENCE3]]
@@ -947,22 +949,22 @@ body: |
   ; CHECK-NEXT:   $sgpr14 = COPY [[COPY]]
   ; CHECK-NEXT:   $sgpr15 = COPY [[DEF9]]
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY6]](s32)
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY30]]
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY31]]
   ; CHECK-NEXT:   $vgpr0 = COPY [[V_MOV_B32_e32_3]]
   ; CHECK-NEXT:   $sgpr30_sgpr31 = SI_CALL [[SI_PC_ADD_REL_OFFSET4]], @_Z10atomic_incPU3AS3Vj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK-NEXT:   [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_2]], [[COPY31]], implicit $exec
+  ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_2]], [[COPY32]], implicit $exec
   ; CHECK-NEXT:   DS_WRITE_B32_gfx9 killed [[V_LSHLREV_B32_e64_2]], [[PHI15]], 0, 0, implicit $exec :: (store (s32), addrspace 3)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.13:
   ; CHECK-NEXT:   successors: %bb.1(0x04000000), %bb.11(0x7c000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI16:%[0-9]+]]:vgpr_32 = PHI [[PHI14]], %bb.11, [[V_ADD_U32_e64_6]], %bb.12
+  ; CHECK-NEXT:   [[PHI16:%[0-9]+]]:vgpr_32 = PHI [[PHI14]], %bb.11, [[V_ADD_U32_e64_8]], %bb.12
   ; CHECK-NEXT:   SI_END_CF [[SI_IF2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI15]], [[S_MOV_B32_9]], 0, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI13]], [[S_MOV_B32_9]], 0, implicit $exec
-  ; CHECK-NEXT:   [[V_CMP_GE_U32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[V_ADD_U32_e64_8]], [[COPY21]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI15]], [[S_MOV_B32_9]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI13]], [[S_MOV_B32_9]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_GE_U32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[V_ADD_U32_e64_10]], [[COPY21]], implicit $exec
   ; CHECK-NEXT:   [[SI_IF_BREAK2:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[V_CMP_GE_U32_e64_2]], [[PHI12]], implicit-def dead $scc
   ; CHECK-NEXT:   SI_LOOP [[SI_IF_BREAK2]], %bb.11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.1
@@ -970,15 +972,15 @@ body: |
   ; CHECK-NEXT: bb.14:
   ; CHECK-NEXT:   SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 40
-  ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
-  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
-  ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_1]].sub0
-  ; CHECK-NEXT:   [[COPY35:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_1]].sub1
-  ; CHECK-NEXT:   [[S_ADD_U32_2:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY32]], [[COPY34]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_ADDC_U32_2:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY33]], [[COPY35]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
+  ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
+  ; CHECK-NEXT:   [[COPY35:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_1]].sub0
+  ; CHECK-NEXT:   [[COPY36:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_1]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_2:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY33]], [[COPY35]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_2:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY34]], [[COPY36]], implicit-def $scc, implicit $scc
   ; CHECK-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_2]], %subreg.sub0, [[S_ADDC_U32_2]], %subreg.sub1
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   [[COPY36:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   [[COPY37:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY5]]
   ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
@@ -991,7 +993,7 @@ body: |
   ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   $sgpr15 = COPY [[DEF11]]
   ; CHECK-NEXT:   $vgpr31 = COPY [[COPY6]](s32)
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY36]]
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY37]]
   ; CHECK-NEXT:   $vgpr0 = COPY [[V_MOV_B32_e32_4]]
   ; CHECK-NEXT:   $sgpr30_sgpr31 = SI_CALL [[SI_PC_ADD_REL_OFFSET2]], @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32


