[llvm] AMDGPU: Fix temporal divergence introduced by machine-sink and performance regression introduced by D155343 (PR #67456)

via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 26 09:46:40 PDT 2023


https://github.com/petar-avramovic created https://github.com/llvm/llvm-project/pull/67456

D155343 introduced performance regressions (SWDEV-414443) by disallowing machine instruction sinking in some cases. It originally fixed SWDEV-407790.

Revert D155343 to fix the performance regression SWDEV-414443,
and introduce a temporal divergence lowering pass that also fixes SWDEV-407790.

Add a target hook to TargetInstrInfo that can be used to fix temporal divergence introduced by sinking an instruction outside a cycle with a divergent exit. The AMDGPU implementation uses machine uniformity analysis to detect temporal-divergent uses.
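
For illustration, a minimal sketch of the shape such a hook could take
(the names and signatures below are hypothetical, not necessarily the
ones added by this patch):

  // Hypothetical TargetInstrInfo hook: called by MachineSink after it
  // decides to sink MI from inside a cycle to SinkBlock outside of it,
  // giving the target a chance to repair uses that become temporally
  // divergent. Returns true if anything was changed.
  virtual bool fixSinkedTemporalDivergence(MachineInstr &MI,
                                           MachineBasicBlock *SinkBlock) const {
    return false; // default: no target-specific fixup needed
  }

  // Hypothetical call site in MachineSink, after the instruction has
  // been moved out of the cycle:
  //   if (SunkOutOfCycle)
  //     TII->fixSinkedTemporalDivergence(MI, SuccToSinkTo);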

From e0b9863fbcda87a2bfbda08571d90c70670c4dca Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Thu, 21 Sep 2023 13:02:43 +0200
Subject: [PATCH 1/3] Revert "MachineSink: Fix sinking VGPR def out of a
 divergent loop"

This reverts commit 3f8ef57bede94445b1a1042c987cc914a886e7ff.
---
 llvm/lib/CodeGen/MachineSink.cpp                  | 15 ++++-----------
 ...-loop-var-out-of-divergent-loop-swdev407790.ll |  2 +-
 ...loop-var-out-of-divergent-loop-swdev407790.mir |  2 +-
 .../CodeGen/AMDGPU/sink-after-control-flow.mir    |  2 +-
 4 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 480ac23d43ad879..15a6c00bce892c6 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -300,7 +300,8 @@ static bool blockPrologueInterferes(const MachineBasicBlock *BB,
       if (!Reg)
         continue;
       if (MO.isUse()) {
-        if (Reg.isPhysical() && MRI && MRI->isConstantPhysReg(Reg))
+        if (Reg.isPhysical() &&
+            (TII->isIgnorableUse(MO) || (MRI && MRI->isConstantPhysReg(Reg))))
           continue;
         if (PI->modifiesRegister(Reg, TRI))
           return true;
@@ -1245,24 +1246,16 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
   if (MBB == SuccToSinkTo)
     return nullptr;
 
-  if (!SuccToSinkTo)
-    return nullptr;
-
   // It's not safe to sink instructions to EH landing pad. Control flow into
   // landing pad is implicitly defined.
-  if (SuccToSinkTo->isEHPad())
+  if (SuccToSinkTo && SuccToSinkTo->isEHPad())
     return nullptr;
 
   // It ought to be okay to sink instructions into an INLINEASM_BR target, but
   // only if we make sure that MI occurs _before_ an INLINEASM_BR instruction in
   // the source block (which this code does not yet do). So for now, forbid
   // doing so.
-  if (SuccToSinkTo->isInlineAsmBrIndirectTarget())
-    return nullptr;
-
-  MachineBasicBlock::const_iterator InsertPos =
-      SuccToSinkTo->SkipPHIsAndLabels(SuccToSinkTo->begin());
-  if (blockPrologueInterferes(SuccToSinkTo, InsertPos, MI, TRI, TII, MRI))
+  if (SuccToSinkTo && SuccToSinkTo->isInlineAsmBrIndirectTarget())
     return nullptr;
 
   return SuccToSinkTo;
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
index e2456b74f7ef1fa..b8e74bc7db09a1a 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
@@ -21,6 +21,7 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
 ; CHECK-NEXT:  .LBB0_1: ; %Flow
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, -4, v4
 ; CHECK-NEXT:  .LBB0_2: ; %Flow1
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
@@ -53,7 +54,6 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_add_nc_u32_e32 v4, s9, v2
 ; CHECK-NEXT:    v_cmp_ge_u32_e64 s4, v4, v0
-; CHECK-NEXT:    v_add_nc_u32_e32 v4, -4, v4
 ; CHECK-NEXT:    s_or_b32 s8, s4, s8
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_1
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
index cc14b4a80d58a7d..037a285794120da 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
@@ -42,7 +42,6 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.5(0x40000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
-  ; CHECK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
   ; CHECK-NEXT:   [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[SI_IF1]], [[SI_IF]], implicit-def dead $scc
   ; CHECK-NEXT:   SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
@@ -52,6 +51,7 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.4
   ; CHECK-NEXT:   SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
   ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]]
   ; CHECK-NEXT:   S_BRANCH %bb.2
   ; CHECK-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir
index ee3d7aeb454f96b..4feef2149b42249 100644
--- a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir
+++ b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir
@@ -17,7 +17,6 @@ body:             |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 8
-  ; GFX10-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[S_MOV_B32_]], [[DEF]], implicit $exec
   ; GFX10-NEXT:   [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[DEF]], 8, 5, implicit $exec
   ; GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5
   ; GFX10-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_BFE_U32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
@@ -38,6 +37,7 @@ body:             |
   ; GFX10-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc
+  ; GFX10-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[S_MOV_B32_]], [[DEF]], implicit $exec
   ; GFX10-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 31
   ; GFX10-NEXT:   [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_BFE_U32_e64_]], killed [[S_MOV_B32_2]], implicit $exec
   ; GFX10-NEXT:   [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_1]], -1, implicit-def $scc

From e4476b764215be28f6fe30f04601c064faee4ab0 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Thu, 21 Sep 2023 14:20:49 +0200
Subject: [PATCH 2/3] AMDGPU: Add test for temporal divergence introduced by
 machine-sink

Introduced by 5b657f50b8e8dc5836fb80e566ca7569fd04c26f, which moved
LICM after AMDGPUCodeGenPrepare. Some instructions are no longer
sunk during IR optimizations but during machine-sinking instead.
If a VGPR instruction that uses an SGPR defined inside the loop is
sunk outside of the loop, we end up with an unhandled case of
temporal divergence.
---
 ...ne-sink-temporal-divergence-swdev407790.ll | 1092 +++++++++++++++++
 1 file changed, 1092 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll

diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
new file mode 100644
index 000000000000000..ca1cf526d949a14
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -0,0 +1,1092 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s
+
+; ModuleID = 'kernel_round1_passing.bc'
+source_filename = "/tmp/comgr-295d04/input/CompileSource"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
+target triple = "amdgcn-amd-amdhsa"
+
+ at kernel_round1.first_words_data = external hidden unnamed_addr addrspace(3) global [896 x i8], align 1
+ at kernel_round1.collisionsData = external hidden unnamed_addr addrspace(3) global [3840 x i32], align 4
+ at kernel_round1.collisionsNum = external hidden addrspace(3) global i32, align 4
+
+; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
+declare hidden i64 @_Z13get_global_idj(i32 noundef) local_unnamed_addr #0
+
+; Function Attrs: convergent nounwind
+declare hidden i32 @_Z10atomic_addPU3AS1Vjj(ptr addrspace(1) noundef, i32 noundef) local_unnamed_addr #1
+
+; Function Attrs: convergent nounwind
+declare hidden i32 @_Z10atomic_subPU3AS1Vjj(ptr addrspace(1) noundef, i32 noundef) local_unnamed_addr #1
+
+; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
+declare hidden i64 @_Z12get_local_idj(i32 noundef) local_unnamed_addr #0
+
+; Function Attrs: convergent nounwind
+declare hidden void @_Z7barrierj(i32 noundef) local_unnamed_addr #1
+
+; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
+declare hidden i32 @_Z3minjj(i32 noundef, i32 noundef) local_unnamed_addr #0
+
+; Function Attrs: convergent nounwind
+declare hidden i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef) local_unnamed_addr #1
+
+; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
+declare hidden i64 @_Z14get_local_sizej(i32 noundef) local_unnamed_addr #0
+
+; Function Attrs: convergent norecurse nounwind
+define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture noundef readonly align 1 %0, ptr addrspace(1) nocapture noundef writeonly align 1 %1, ptr addrspace(1) nocapture noundef readonly align 4 %2, ptr addrspace(1) noundef align 4 %3, ptr addrspace(1) nocapture noundef readnone align 4 %4) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 !reqd_work_group_size !10 {
+; CHECK-LABEL: kernel_round1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_load_dwordx8 s[44:51], s[6:7], 0x0
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v41, v0
+; CHECK-NEXT:    s_add_u32 s42, s34, 40
+; CHECK-NEXT:    v_mov_b32_e32 v31, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
+; CHECK-NEXT:    s_addc_u32 s43, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    s_mov_b32 s33, s14
+; CHECK-NEXT:    s_mov_b32 s40, s13
+; CHECK-NEXT:    s_mov_b32 s41, s12
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, _Z13get_global_idj at rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, _Z13get_global_idj at rel32@hi+12
+; CHECK-NEXT:    v_mov_b32_e32 v45, 0
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT:    v_mov_b32_e32 v43, v0
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, _Z12get_local_idj at rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, _Z12get_local_idj at rel32@hi+12
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT:    v_mov_b32_e32 v40, v0
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    ds_write_b32 v45, v45 offset:15360
+; CHECK-NEXT:    s_getpc_b64 s[52:53]
+; CHECK-NEXT:    s_add_u32 s52, s52, _Z7barrierj at rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s53, s53, _Z7barrierj at rel32@hi+12
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[52:53]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 1, v43
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 2, v43
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x7ffffffc, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, 28, v1
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    global_load_dword v0, v0, s[48:49]
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, _Z3minjj at rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, _Z3minjj at rel32@hi+12
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_bfe_u32 v0, v0, v1, 4
+; CHECK-NEXT:    v_mov_b32_e32 v1, 12
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT:    v_mov_b32_e32 v42, v0
+; CHECK-NEXT:    s_mov_b32 s48, exec_lo
+; CHECK-NEXT:    v_cmpx_ne_u32_e32 0, v42
+; CHECK-NEXT:    s_cbranch_execz .LBB0_25
+; CHECK-NEXT:  ; %bb.1: ; %.preheader5
+; CHECK-NEXT:    v_mul_lo_u32 v0, v40, 14
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    s_mov_b32 s5, 0
+; CHECK-NEXT:    v_add_nc_u32_e32 v44, 0x3c04, v0
+; CHECK-NEXT:  .LBB0_2: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_add_nc_u32_e32 v1, s5, v44
+; CHECK-NEXT:    s_add_i32 s5, s5, 1
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s5, v42
+; CHECK-NEXT:    ds_write_b8 v1, v45
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_2
+; CHECK-NEXT:  ; %bb.3:
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT:    v_add_nc_u32_e32 v45, -1, v42
+; CHECK-NEXT:    s_mov_b32 s49, 0
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v45
+; CHECK-NEXT:    s_and_b32 exec_lo, exec_lo, vcc_lo
+; CHECK-NEXT:    s_cbranch_execz .LBB0_25
+; CHECK-NEXT:  ; %bb.4:
+; CHECK-NEXT:    v_lshlrev_b32_e32 v43, 10, v43
+; CHECK-NEXT:    v_add_nc_u32_e32 v46, 0x3c05, v0
+; CHECK-NEXT:    v_mov_b32_e32 v47, 0
+; CHECK-NEXT:    s_getpc_b64 s[42:43]
+; CHECK-NEXT:    s_add_u32 s42, s42, _Z10atomic_incPU3AS3Vj at rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s43, s43, _Z10atomic_incPU3AS3Vj at rel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s55, 0
+; CHECK-NEXT:  .LBB0_5: ; =>This Loop Header: Depth=1
+; CHECK-NEXT:    ; Child Loop BB0_8 Depth 2
+; CHECK-NEXT:    ; Child Loop BB0_20 Depth 2
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, s55, v44
+; CHECK-NEXT:    s_lshl_b32 s4, s55, 5
+; CHECK-NEXT:    s_add_i32 s54, s55, 1
+; CHECK-NEXT:    s_add_i32 s5, s55, 5
+; CHECK-NEXT:    v_or3_b32 v57, s4, v43, s54
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    ds_read_u8 v56, v0
+; CHECK-NEXT:    v_mov_b32_e32 v59, s54
+; CHECK-NEXT:    s_mov_b32 s56, exec_lo
+; CHECK-NEXT:    v_cmpx_lt_u32_e64 s5, v42
+; CHECK-NEXT:    s_cbranch_execz .LBB0_17
+; CHECK-NEXT:  ; %bb.6: ; %.preheader2
+; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v58, 0xff, v56
+; CHECK-NEXT:    s_mov_b32 s57, 0
+; CHECK-NEXT:    s_mov_b32 s58, 0
+; CHECK-NEXT:    s_branch .LBB0_8
+; CHECK-NEXT:  .LBB0_7: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s59
+; CHECK-NEXT:    s_add_i32 s58, s58, 4
+; CHECK-NEXT:    s_add_i32 s4, s55, s58
+; CHECK-NEXT:    s_add_i32 s5, s4, 5
+; CHECK-NEXT:    s_add_i32 s4, s4, 1
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s5, v42
+; CHECK-NEXT:    v_mov_b32_e32 v59, s4
+; CHECK-NEXT:    s_or_b32 s57, vcc_lo, s57
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s57
+; CHECK-NEXT:    s_cbranch_execz .LBB0_16
+; CHECK-NEXT:  .LBB0_8: ; Parent Loop BB0_5 Depth=1
+; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    v_add_nc_u32_e32 v60, s58, v46
+; CHECK-NEXT:    v_add_nc_u32_e32 v59, s58, v57
+; CHECK-NEXT:    s_mov_b32 s59, exec_lo
+; CHECK-NEXT:    ds_read_u8 v0, v60
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_cmpx_eq_u16_e64 v58, v0
+; CHECK-NEXT:    s_cbranch_execz .LBB0_10
+; CHECK-NEXT:  ; %bb.9: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:    ds_write_b32 v0, v59
+; CHECK-NEXT:  .LBB0_10: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s59
+; CHECK-NEXT:    ds_read_u8 v0, v60 offset:1
+; CHECK-NEXT:    s_mov_b32 s59, exec_lo
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_cmpx_eq_u16_e64 v58, v0
+; CHECK-NEXT:    s_cbranch_execz .LBB0_12
+; CHECK-NEXT:  ; %bb.11: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    v_add_nc_u32_e32 v61, 1, v59
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:    ds_write_b32 v0, v61
+; CHECK-NEXT:  .LBB0_12: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s59
+; CHECK-NEXT:    ds_read_u8 v0, v60 offset:2
+; CHECK-NEXT:    s_mov_b32 s59, exec_lo
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_cmpx_eq_u16_e64 v58, v0
+; CHECK-NEXT:    s_cbranch_execz .LBB0_14
+; CHECK-NEXT:  ; %bb.13: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    v_add_nc_u32_e32 v61, 2, v59
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:    ds_write_b32 v0, v61
+; CHECK-NEXT:  .LBB0_14: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s59
+; CHECK-NEXT:    ds_read_u8 v0, v60 offset:3
+; CHECK-NEXT:    s_mov_b32 s59, exec_lo
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_cmpx_eq_u16_e64 v58, v0
+; CHECK-NEXT:    s_cbranch_execz .LBB0_7
+; CHECK-NEXT:  ; %bb.15: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    v_add_nc_u32_e32 v59, 3, v59
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:    ds_write_b32 v0, v59
+; CHECK-NEXT:    s_branch .LBB0_7
+; CHECK-NEXT:  .LBB0_16: ; %Flow43
+; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
+; CHECK-NEXT:    v_add_nc_u32_e32 v57, s58, v57
+; CHECK-NEXT:  .LBB0_17: ; %Flow44
+; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s56
+; CHECK-NEXT:    s_mov_b32 s55, exec_lo
+; CHECK-NEXT:    v_cmpx_lt_u32_e64 v59, v42
+; CHECK-NEXT:    s_cbranch_execz .LBB0_23
+; CHECK-NEXT:  ; %bb.18: ; %.preheader
+; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    s_mov_b32 s56, 0
+; CHECK-NEXT:    s_inst_prefetch 0x1
+; CHECK-NEXT:    s_branch .LBB0_20
+; CHECK-NEXT:    .p2align 6
+; CHECK-NEXT:  .LBB0_19: ; in Loop: Header=BB0_20 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
+; CHECK-NEXT:    v_add_nc_u32_e32 v59, 1, v59
+; CHECK-NEXT:    v_add_nc_u32_e32 v57, 1, v57
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v59, v42
+; CHECK-NEXT:    s_or_b32 s56, vcc_lo, s56
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s56
+; CHECK-NEXT:    s_cbranch_execz .LBB0_22
+; CHECK-NEXT:  .LBB0_20: ; Parent Loop BB0_5 Depth=1
+; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, v44, v59
+; CHECK-NEXT:    ds_read_u8 v0, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NEXT:    s_and_saveexec_b32 s57, s4
+; CHECK-NEXT:    s_cbranch_execz .LBB0_19
+; CHECK-NEXT:  ; %bb.21: ; in Loop: Header=BB0_20 Depth=2
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:    ds_write_b32 v0, v57
+; CHECK-NEXT:    s_branch .LBB0_19
+; CHECK-NEXT:  .LBB0_22: ; %Flow41
+; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    s_inst_prefetch 0x2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s56
+; CHECK-NEXT:  .LBB0_23: ; %Flow42
+; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s55
+; CHECK-NEXT:  ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s54, v45
+; CHECK-NEXT:    v_cmp_lt_u32_e64 s4, 59, v47
+; CHECK-NEXT:    v_add_nc_u32_e32 v46, 1, v46
+; CHECK-NEXT:    s_mov_b32 s55, s54
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_and_b32 s4, exec_lo, s4
+; CHECK-NEXT:    s_or_b32 s49, s4, s49
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s49
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_5
+; CHECK-NEXT:  .LBB0_25: ; %Flow49
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s48
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[52:53]
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_mov_b32 s4, exec_lo
+; CHECK-NEXT:    ds_read_b32 v47, v0 offset:15360
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_cmpx_gt_u32_e64 v47, v40
+; CHECK-NEXT:    s_cbranch_execz .LBB0_33
+; CHECK-NEXT:  ; %bb.26:
+; CHECK-NEXT:    s_add_u32 s52, s44, 8
+; CHECK-NEXT:    s_addc_u32 s53, s45, 0
+; CHECK-NEXT:    s_getpc_b64 s[42:43]
+; CHECK-NEXT:    s_add_u32 s42, s42, _Z10atomic_addPU3AS1Vjj at rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s43, s43, _Z10atomic_addPU3AS1Vjj at rel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s54, 0
+; CHECK-NEXT:    s_getpc_b64 s[44:45]
+; CHECK-NEXT:    s_add_u32 s44, s44, _Z10atomic_subPU3AS1Vjj at rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s45, s45, _Z10atomic_subPU3AS1Vjj at rel32@hi+12
+; CHECK-NEXT:    s_getpc_b64 s[48:49]
+; CHECK-NEXT:    s_add_u32 s48, s48, _Z14get_local_sizej at rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s49, s49, _Z14get_local_sizej at rel32@hi+12
+; CHECK-NEXT:    s_branch .LBB0_28
+; CHECK-NEXT:  .LBB0_27: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s55
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[48:49]
+; CHECK-NEXT:    v_add_co_u32 v40, vcc_lo, v0, v40
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc_lo, v47, v40
+; CHECK-NEXT:    s_or_b32 s54, vcc_lo, s54
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s54
+; CHECK-NEXT:    s_cbranch_execz .LBB0_33
+; CHECK-NEXT:  .LBB0_28: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v40
+; CHECK-NEXT:    s_mov_b32 s55, exec_lo
+; CHECK-NEXT:    ds_read_b32 v0, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_lshrrev_b32_e32 v63, 10, v0
+; CHECK-NEXT:    v_bfe_u32 v62, v0, 5, 5
+; CHECK-NEXT:    v_and_b32_e32 v72, 31, v0
+; CHECK-NEXT:    v_mul_u32_u24_e32 v1, 0x180, v63
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 5, v62
+; CHECK-NEXT:    v_lshlrev_b32_e32 v4, 5, v72
+; CHECK-NEXT:    v_add_co_u32 v2, s4, s52, v1
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, s53, 0, s4
+; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_xor_b32_e32 v46, v9, v5
+; CHECK-NEXT:    v_xor_b32_e32 v45, v8, v4
+; CHECK-NEXT:    v_xor_b32_e32 v57, v11, v7
+; CHECK-NEXT:    v_xor_b32_e32 v56, v10, v6
+; CHECK-NEXT:    v_or_b32_e32 v5, v46, v57
+; CHECK-NEXT:    v_or_b32_e32 v4, v45, v56
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[4:5]
+; CHECK-NEXT:    s_cbranch_execz .LBB0_27
+; CHECK-NEXT:  ; %bb.29: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx2 v[58:59], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[60:61], v[0:1], off offset:16
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 4, v45
+; CHECK-NEXT:    v_alignbit_b32 v1, v46, v45, 12
+; CHECK-NEXT:    v_and_b32_e32 v2, 0xf0000, v45
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    v_and_b32_e32 v3, 0xf000, v0
+; CHECK-NEXT:    v_and_b32_e32 v4, 0xf00, v1
+; CHECK-NEXT:    v_and_b32_e32 v0, 0xf0, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, 15, v1
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    v_or3_b32 v2, v3, v2, v4
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    v_or3_b32 v73, v2, v0, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 1, v73
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 2, v73
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fffc, v0
+; CHECK-NEXT:    v_lshlrev_b32_e64 v44, v1, 1
+; CHECK-NEXT:    v_and_b32_e32 v74, 28, v1
+; CHECK-NEXT:    v_add_co_u32 v42, s4, s50, v0
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v43, null, s51, 0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v2, v44
+; CHECK-NEXT:    v_mov_b32_e32 v0, v42
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    v_mov_b32_e32 v1, v43
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT:    v_bfe_u32 v0, v0, v74, 4
+; CHECK-NEXT:    s_mov_b32 s4, exec_lo
+; CHECK-NEXT:    v_cmpx_gt_u32_e32 12, v0
+; CHECK-NEXT:    s_xor_b32 s4, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execz .LBB0_31
+; CHECK-NEXT:  ; %bb.30: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT:    v_xor_b32_e32 v5, v60, v58
+; CHECK-NEXT:    v_lshrrev_b64 v[3:4], 16, v[56:57]
+; CHECK-NEXT:    v_mul_u32_u24_e32 v11, 0x180, v73
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; CHECK-NEXT:    v_lshrrev_b64 v[1:2], 16, v[45:46]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; CHECK-NEXT:    v_lshlrev_b32_e32 v8, 6, v72
+; CHECK-NEXT:    v_lshlrev_b32_e32 v10, 12, v63
+; CHECK-NEXT:    v_xor_b32_e32 v6, v61, v59
+; CHECK-NEXT:    v_lshlrev_b32_e32 v9, 16, v56
+; CHECK-NEXT:    v_or_b32_e32 v4, v7, v4
+; CHECK-NEXT:    v_add_co_u32 v7, s5, s46, v11
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, s47, 0, s5
+; CHECK-NEXT:    v_or3_b32 v10, v8, v10, v62
+; CHECK-NEXT:    v_add_co_u32 v7, vcc_lo, v7, v0
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v11, vcc_lo
+; CHECK-NEXT:    v_lshrrev_b64 v[5:6], 16, v[5:6]
+; CHECK-NEXT:    v_or_b32_e32 v2, v9, v2
+; CHECK-NEXT:    global_store_dword v[7:8], v10, off offset:4
+; CHECK-NEXT:    global_store_dwordx4 v[7:8], v[1:4], off offset:8
+; CHECK-NEXT:    global_store_dwordx2 v[7:8], v[5:6], off offset:24
+; CHECK-NEXT:    ; implicit-def: $vgpr42
+; CHECK-NEXT:    ; implicit-def: $vgpr43
+; CHECK-NEXT:    ; implicit-def: $vgpr44
+; CHECK-NEXT:  .LBB0_31: ; %Flow
+; CHECK-NEXT:    ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT:    s_andn2_saveexec_b32 s4, s4
+; CHECK-NEXT:    s_cbranch_execz .LBB0_27
+; CHECK-NEXT:  ; %bb.32: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT:    v_mov_b32_e32 v31, v41
+; CHECK-NEXT:    v_mov_b32_e32 v0, v42
+; CHECK-NEXT:    v_mov_b32_e32 v1, v43
+; CHECK-NEXT:    v_mov_b32_e32 v2, v44
+; CHECK-NEXT:    s_add_u32 s8, s34, 40
+; CHECK-NEXT:    s_addc_u32 s9, s35, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[44:45]
+; CHECK-NEXT:    s_branch .LBB0_27
+; CHECK-NEXT:  .LBB0_33:
+; CHECK-NEXT:    s_endpgm
+  %6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4
+  %7 = trunc i64 %6 to i32
+  %8 = tail call i64 @_Z12get_local_idj(i32 noundef 0) #4
+  %9 = trunc i64 %8 to i32
+  %10 = mul i32 %9, 14
+  %11 = getelementptr inbounds i8, ptr addrspace(3) @kernel_round1.first_words_data, i32 %10
+  store i32 0, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
+  tail call void @_Z7barrierj(i32 noundef 1) #5
+  %12 = lshr i64 %6, 3
+  %13 = shl i32 %7, 2
+  %14 = and i32 %13, 28
+  %15 = and i64 %12, 536870911
+  %16 = getelementptr inbounds i32, ptr addrspace(1) %2, i64 %15
+  %17 = load i32, ptr addrspace(1) %16, align 4, !tbaa !11
+  %18 = lshr i32 %17, %14
+  %19 = and i32 %18, 15
+  %20 = tail call i32 @_Z3minjj(i32 noundef %19, i32 noundef 12) #4
+  %21 = icmp eq i32 %20, 0
+  br i1 %21, label %119, label %27
+
+22:                                               ; preds = %27
+  %23 = add i32 %20, -1
+  %24 = icmp eq i32 %23, 0
+  br i1 %24, label %119, label %25
+
+25:                                               ; preds = %22
+  %26 = shl i32 %7, 10
+  br label %37
+
+27:                                               ; preds = %5, %27
+  %28 = phi i32 [ %30, %27 ], [ 0, %5 ]
+  %29 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %28
+  store i8 0, ptr addrspace(3) %29, align 1, !tbaa !15
+  %30 = add nuw i32 %28, 1
+  %31 = icmp eq i32 %30, %20
+  br i1 %31, label %22, label %27
+
+32:                                               ; preds = %114, %48
+  %33 = phi i32 [ %50, %48 ], [ %115, %114 ]
+  %34 = icmp ult i32 %44, %23
+  %35 = icmp ult i32 %33, 60
+  %36 = select i1 %34, i1 %35, i1 false
+  br i1 %36, label %37, label %119
+
+37:                                               ; preds = %32, %25
+  %38 = phi i32 [ 0, %25 ], [ %44, %32 ]
+  %39 = phi i32 [ 0, %25 ], [ %33, %32 ]
+  %40 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %38
+  %41 = load i8, ptr addrspace(3) %40, align 1, !tbaa !15
+  %42 = shl i32 %38, 5
+  %43 = or i32 %42, %26
+  %44 = add nuw i32 %38, 1
+  %45 = or i32 %43, %44
+  %46 = add i32 %38, 5
+  %47 = icmp ult i32 %46, %20
+  br i1 %47, label %53, label %48
+
+48:                                               ; preds = %98, %37
+  %49 = phi i32 [ %45, %37 ], [ %100, %98 ]
+  %50 = phi i32 [ %39, %37 ], [ %99, %98 ]
+  %51 = phi i32 [ %44, %37 ], [ %54, %98 ]
+  %52 = icmp ult i32 %51, %20
+  br i1 %52, label %103, label %32
+
+53:                                               ; preds = %37, %98
+  %54 = phi i32 [ %101, %98 ], [ %46, %37 ]
+  %55 = phi i32 [ %54, %98 ], [ %44, %37 ]
+  %56 = phi i32 [ %99, %98 ], [ %39, %37 ]
+  %57 = phi i32 [ %100, %98 ], [ %45, %37 ]
+  %58 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %55
+  %59 = load i8, ptr addrspace(3) %58, align 1, !tbaa !15
+  %60 = icmp eq i8 %41, %59
+  br i1 %60, label %61, label %65
+
+61:                                               ; preds = %53
+  %62 = add i32 %56, 1
+  %63 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+  %64 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %63
+  store i32 %57, ptr addrspace(3) %64, align 4, !tbaa !11
+  br label %65
+
+65:                                               ; preds = %61, %53
+  %66 = phi i32 [ %62, %61 ], [ %56, %53 ]
+  %67 = add i32 %55, 1
+  %68 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %67
+  %69 = load i8, ptr addrspace(3) %68, align 1, !tbaa !15
+  %70 = icmp eq i8 %41, %69
+  br i1 %70, label %71, label %76
+
+71:                                               ; preds = %65
+  %72 = add i32 %57, 1
+  %73 = add i32 %66, 1
+  %74 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+  %75 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %74
+  store i32 %72, ptr addrspace(3) %75, align 4, !tbaa !11
+  br label %76
+
+76:                                               ; preds = %71, %65
+  %77 = phi i32 [ %73, %71 ], [ %66, %65 ]
+  %78 = add i32 %55, 2
+  %79 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %78
+  %80 = load i8, ptr addrspace(3) %79, align 1, !tbaa !15
+  %81 = icmp eq i8 %41, %80
+  br i1 %81, label %82, label %87
+
+82:                                               ; preds = %76
+  %83 = add i32 %57, 2
+  %84 = add i32 %77, 1
+  %85 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+  %86 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %85
+  store i32 %83, ptr addrspace(3) %86, align 4, !tbaa !11
+  br label %87
+
+87:                                               ; preds = %82, %76
+  %88 = phi i32 [ %84, %82 ], [ %77, %76 ]
+  %89 = add i32 %55, 3
+  %90 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %89
+  %91 = load i8, ptr addrspace(3) %90, align 1, !tbaa !15
+  %92 = icmp eq i8 %41, %91
+  br i1 %92, label %93, label %98
+
+93:                                               ; preds = %87
+  %94 = add i32 %57, 3
+  %95 = add i32 %88, 1
+  %96 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+  %97 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %96
+  store i32 %94, ptr addrspace(3) %97, align 4, !tbaa !11
+  br label %98
+
+98:                                               ; preds = %93, %87
+  %99 = phi i32 [ %95, %93 ], [ %88, %87 ]
+  %100 = add i32 %57, 4
+  %101 = add i32 %54, 4
+  %102 = icmp ult i32 %101, %20
+  br i1 %102, label %53, label %48
+
+103:                                              ; preds = %48, %114
+  %104 = phi i32 [ %117, %114 ], [ %51, %48 ]
+  %105 = phi i32 [ %115, %114 ], [ %50, %48 ]
+  %106 = phi i32 [ %116, %114 ], [ %49, %48 ]
+  %107 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %104
+  %108 = load i8, ptr addrspace(3) %107, align 1, !tbaa !15
+  %109 = icmp eq i8 %41, %108
+  br i1 %109, label %110, label %114
+
+110:                                              ; preds = %103
+  %111 = add i32 %105, 1
+  %112 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+  %113 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %112
+  store i32 %106, ptr addrspace(3) %113, align 4, !tbaa !11
+  br label %114
+
+114:                                              ; preds = %110, %103
+  %115 = phi i32 [ %111, %110 ], [ %105, %103 ]
+  %116 = add i32 %106, 1
+  %117 = add nuw i32 %104, 1
+  %118 = icmp ult i32 %117, %20
+  br i1 %118, label %103, label %32
+
+119:                                              ; preds = %32, %22, %5
+  tail call void @_Z7barrierj(i32 noundef 1) #5
+  %120 = load i32, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
+  %121 = icmp ugt i32 %120, %9
+  br i1 %121, label %122, label %206
+
+122:                                              ; preds = %119
+  %123 = getelementptr inbounds i8, ptr addrspace(1) %0, i64 8
+  br label %124
+
+124:                                              ; preds = %201, %122
+  %125 = phi i32 [ %9, %122 ], [ %204, %201 ]
+  %126 = phi i64 [ %8, %122 ], [ %203, %201 ]
+  %127 = and i64 %126, 4294967295
+  %128 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %125
+  %129 = load i32, ptr addrspace(3) %128, align 4, !tbaa !11
+  %130 = lshr i32 %129, 10
+  %131 = lshr i32 %129, 5
+  %132 = and i32 %131, 31
+  %133 = and i32 %129, 31
+  %134 = mul nuw nsw i32 %130, 384
+  %135 = zext i32 %134 to i64
+  %136 = getelementptr inbounds i8, ptr addrspace(1) %123, i64 %135
+  %137 = shl nuw nsw i32 %132, 5
+  %138 = zext i32 %137 to i64
+  %139 = getelementptr inbounds i8, ptr addrspace(1) %136, i64 %138
+  %140 = shl nuw nsw i32 %133, 5
+  %141 = zext i32 %140 to i64
+  %142 = getelementptr inbounds i8, ptr addrspace(1) %136, i64 %141
+  %143 = getelementptr inbounds i64, ptr addrspace(1) %139, i64 1
+  %144 = load i64, ptr addrspace(1) %139, align 8, !tbaa !16
+  %145 = getelementptr inbounds i64, ptr addrspace(1) %142, i64 1
+  %146 = load i64, ptr addrspace(1) %142, align 8, !tbaa !16
+  %147 = xor i64 %146, %144
+  %148 = load i64, ptr addrspace(1) %143, align 8, !tbaa !16
+  %149 = load i64, ptr addrspace(1) %145, align 8, !tbaa !16
+  %150 = xor i64 %149, %148
+  %151 = icmp ne i64 %147, 0
+  %152 = icmp ne i64 %150, 0
+  %153 = select i1 %151, i1 true, i1 %152
+  br i1 %153, label %154, label %201
+
+154:                                              ; preds = %124
+  %155 = getelementptr inbounds i64, ptr addrspace(1) %142, i64 2
+  %156 = load i64, ptr addrspace(1) %155, align 8, !tbaa !16
+  %157 = getelementptr inbounds i64, ptr addrspace(1) %139, i64 2
+  %158 = load i64, ptr addrspace(1) %157, align 8, !tbaa !16
+  %159 = and i64 %147, 983040
+  %160 = shl i64 %147, 4
+  %161 = and i64 %160, 61440
+  %162 = or i64 %161, %159
+  %163 = lshr i64 %147, 12
+  %164 = and i64 %163, 3840
+  %165 = or i64 %162, %164
+  %166 = and i64 %160, 240
+  %167 = or i64 %165, %166
+  %168 = and i64 %163, 15
+  %169 = or i64 %167, %168
+  %170 = trunc i64 %169 to i32
+  %171 = lshr i64 %169, 3
+  %172 = shl nuw nsw i32 %170, 2
+  %173 = and i32 %172, 28
+  %174 = getelementptr inbounds i32, ptr addrspace(1) %3, i64 %171
+  %175 = shl nuw nsw i32 1, %173
+  %176 = tail call i32 @_Z10atomic_addPU3AS1Vjj(ptr addrspace(1) noundef %174, i32 noundef %175) #5
+  %177 = lshr i32 %176, %173
+  %178 = and i32 %177, 15
+  %179 = icmp ugt i32 %178, 11
+  br i1 %179, label %180, label %182
+
+180:                                              ; preds = %154
+  %181 = tail call i32 @_Z10atomic_subPU3AS1Vjj(ptr addrspace(1) noundef %174, i32 noundef %175) #5
+  br label %201
+
+182:                                              ; preds = %154
+  %183 = xor i64 %158, %156
+  %184 = lshr i64 %183, 16
+  %185 = tail call i64 @llvm.fshl.i64(i64 %183, i64 %150, i64 48)
+  %186 = tail call i64 @llvm.fshl.i64(i64 %150, i64 %147, i64 48)
+  %187 = shl nuw nsw i32 %133, 6
+  %188 = shl i32 %130, 12
+  %189 = or i32 %187, %188
+  %190 = or i32 %189, %132
+  %191 = mul nuw nsw i64 %169, 384
+  %192 = and i64 %191, 4294967168
+  %193 = getelementptr inbounds i8, ptr addrspace(1) %1, i64 %192
+  %194 = shl nuw nsw i32 %178, 5
+  %195 = or i32 %194, 8
+  %196 = zext i32 %195 to i64
+  %197 = getelementptr inbounds i8, ptr addrspace(1) %193, i64 %196
+  %198 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 -4
+  store i32 %190, ptr addrspace(1) %198, align 4, !tbaa !11
+  store i64 %186, ptr addrspace(1) %197, align 8, !tbaa !16
+  %199 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 8
+  store i64 %185, ptr addrspace(1) %199, align 8, !tbaa !16
+  %200 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 16
+  store i64 %184, ptr addrspace(1) %200, align 8, !tbaa !16
+  br label %201
+
+201:                                              ; preds = %182, %180, %124
+  %202 = tail call i64 @_Z14get_local_sizej(i32 noundef 0) #4
+  %203 = add i64 %202, %127
+  %204 = trunc i64 %203 to i32
+  %205 = icmp ugt i32 %120, %204
+  br i1 %205, label %124, label %206
+
+206:                                              ; preds = %201, %119
+  ret void
+}
+
+; Removed most of the if-else blocks
+
+define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapture noundef readonly align 1 %.0, ptr addrspace(1) nocapture noundef writeonly align 1 %.1, ptr addrspace(1) nocapture noundef readonly align 4 %.2, ptr addrspace(1) noundef align 4 %.3, ptr addrspace(1) nocapture noundef readnone align 4 %.4) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 !reqd_work_group_size !10 {
+; CHECK-LABEL: kernel_round1_short:
+; CHECK:       ; %bb.0: ; %.5
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_load_dwordx2 s[46:47], s[6:7], 0x10
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v40, v0
+; CHECK-NEXT:    s_add_u32 s42, s36, 40
+; CHECK-NEXT:    v_mov_b32_e32 v31, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_mov_b64 s[34:35], s[8:9]
+; CHECK-NEXT:    s_addc_u32 s43, s37, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    s_mov_b32 s33, s14
+; CHECK-NEXT:    s_mov_b32 s40, s13
+; CHECK-NEXT:    s_mov_b32 s41, s12
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, _Z13get_global_idj at rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, _Z13get_global_idj at rel32@hi+12
+; CHECK-NEXT:    v_mov_b32_e32 v43, 0
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT:    v_mov_b32_e32 v42, v0
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, _Z12get_local_idj at rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, _Z12get_local_idj at rel32@hi+12
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT:    v_mul_lo_u32 v46, v0, 14
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
+; CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    ds_write_b32 v43, v43 offset:15360
+; CHECK-NEXT:    s_getpc_b64 s[44:45]
+; CHECK-NEXT:    s_add_u32 s44, s44, _Z7barrierj at rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s45, s45, _Z7barrierj at rel32@hi+12
+; CHECK-NEXT:    v_add_nc_u32_e32 v44, 0x3c04, v46
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[44:45]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 1, v42
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 2, v42
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x7ffffffc, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, 28, v1
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    global_load_dword v0, v0, s[46:47]
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, _Z3minjj at rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, _Z3minjj at rel32@hi+12
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_bfe_u32 v0, v0, v1, 4
+; CHECK-NEXT:    v_mov_b32_e32 v1, 12
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT:    v_mov_b32_e32 v41, v0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v42, 10, v42
+; CHECK-NEXT:    s_getpc_b64 s[42:43]
+; CHECK-NEXT:    s_add_u32 s42, s42, _Z10atomic_incPU3AS3Vj at rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s43, s43, _Z10atomic_incPU3AS3Vj at rel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s46, 0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    v_add_nc_u32_e32 v45, -1, v41
+; CHECK-NEXT:    ds_write_b8 v46, v43 offset:15364
+; CHECK-NEXT:  .LBB1_1: ; %.37
+; CHECK-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NEXT:    ; Child Loop BB1_3 Depth 2
+; CHECK-NEXT:    ; Child Loop BB1_8 Depth 2
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, s4, v44
+; CHECK-NEXT:    s_lshl_b32 s5, s4, 5
+; CHECK-NEXT:    s_add_i32 s47, s4, 1
+; CHECK-NEXT:    s_add_i32 s6, s4, 5
+; CHECK-NEXT:    v_or3_b32 v47, s5, v42, s47
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    ds_read_u8 v46, v0
+; CHECK-NEXT:    v_mov_b32_e32 v56, s47
+; CHECK-NEXT:    s_mov_b32 s5, exec_lo
+; CHECK-NEXT:    v_cmpx_lt_u32_e64 s6, v41
+; CHECK-NEXT:    s_cbranch_execz .LBB1_5
+; CHECK-NEXT:  ; %bb.2: ; %.53.preheader
+; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    s_mov_b32 s6, 0
+; CHECK-NEXT:    s_mov_b32 s7, 0
+; CHECK-NEXT:  .LBB1_3: ; %.53
+; CHECK-NEXT:    ; Parent Loop BB1_1 Depth=1
+; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    s_add_i32 s7, s7, 4
+; CHECK-NEXT:    v_add_nc_u32_e32 v43, 1, v43
+; CHECK-NEXT:    s_add_i32 s8, s4, s7
+; CHECK-NEXT:    s_add_i32 s9, s8, 5
+; CHECK-NEXT:    s_add_i32 s8, s8, 1
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s9, v41
+; CHECK-NEXT:    v_mov_b32_e32 v56, s8
+; CHECK-NEXT:    s_or_b32 s6, vcc_lo, s6
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT:    s_cbranch_execnz .LBB1_3
+; CHECK-NEXT:  ; %bb.4: ; %Flow3
+; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, s7, v47
+; CHECK-NEXT:  .LBB1_5: ; %Flow4
+; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT:    s_mov_b32 s48, exec_lo
+; CHECK-NEXT:    v_cmpx_lt_u32_e64 v56, v41
+; CHECK-NEXT:    s_cbranch_execz .LBB1_11
+; CHECK-NEXT:  ; %bb.6: ; %.103.preheader
+; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    s_mov_b32 s49, 0
+; CHECK-NEXT:    s_inst_prefetch 0x1
+; CHECK-NEXT:    s_branch .LBB1_8
+; CHECK-NEXT:    .p2align 6
+; CHECK-NEXT:  .LBB1_7: ; %.114
+; CHECK-NEXT:    ; in Loop: Header=BB1_8 Depth=2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s50
+; CHECK-NEXT:    v_add_nc_u32_e32 v56, 1, v56
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v56, v41
+; CHECK-NEXT:    s_or_b32 s49, vcc_lo, s49
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s49
+; CHECK-NEXT:    s_cbranch_execz .LBB1_10
+; CHECK-NEXT:  .LBB1_8: ; %.103
+; CHECK-NEXT:    ; Parent Loop BB1_1 Depth=1
+; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, v44, v56
+; CHECK-NEXT:    ds_read_u8 v0, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NEXT:    s_and_saveexec_b32 s50, s4
+; CHECK-NEXT:    s_cbranch_execz .LBB1_7
+; CHECK-NEXT:  ; %bb.9: ; %.110
+; CHECK-NEXT:    ; in Loop: Header=BB1_8 Depth=2
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT:    s_add_u32 s8, s36, 40
+; CHECK-NEXT:    s_addc_u32 s9, s37, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    v_add_nc_u32_e32 v43, 1, v43
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:    ds_write_b32 v0, v47
+; CHECK-NEXT:    s_branch .LBB1_7
+; CHECK-NEXT:  .LBB1_10: ; %Flow
+; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    s_inst_prefetch 0x2
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s49
+; CHECK-NEXT:  .LBB1_11: ; %Flow2
+; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s48
+; CHECK-NEXT:  ; %bb.12: ; %.32
+; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s47, v45
+; CHECK-NEXT:    v_cmp_lt_u32_e64 s4, 59, v43
+; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT:    s_and_b32 s4, exec_lo, s4
+; CHECK-NEXT:    s_or_b32 s46, s4, s46
+; CHECK-NEXT:    s_mov_b32 s4, s47
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s46
+; CHECK-NEXT:    s_cbranch_execnz .LBB1_1
+; CHECK-NEXT:  ; %bb.13: ; %.119
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s46
+; CHECK-NEXT:    v_mov_b32_e32 v31, v40
+; CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; CHECK-NEXT:    s_add_u32 s8, s36, 40
+; CHECK-NEXT:    s_addc_u32 s9, s37, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b32 s12, s41
+; CHECK-NEXT:    s_mov_b32 s13, s40
+; CHECK-NEXT:    s_mov_b32 s14, s33
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[44:45]
+; CHECK-NEXT:    s_endpgm
+.5:
+  %.6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4
+  %.7 = trunc i64 %.6 to i32
+  %.8 = tail call i64 @_Z12get_local_idj(i32 noundef 0) #4
+  %.9 = trunc i64 %.8 to i32
+  %.10 = mul i32 %.9, 14
+  %.11 = getelementptr inbounds i8, ptr addrspace(3) @kernel_round1.first_words_data, i32 %.10
+  store i32 0, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
+  tail call void @_Z7barrierj(i32 noundef 1) #5
+  %.12 = lshr i64 %.6, 3
+  %.13 = shl i32 %.7, 2
+  %.14 = and i32 %.13, 28
+  %.15 = and i64 %.12, 536870911
+  %.16 = getelementptr inbounds i32, ptr addrspace(1) %.2, i64 %.15
+  %.17 = load i32, ptr addrspace(1) %.16, align 4, !tbaa !11
+  %.18 = lshr i32 %.17, %.14
+  %.19 = and i32 %.18, 15
+  %.20 = tail call i32 @_Z3minjj(i32 noundef %.19, i32 noundef 12) #4
+  %.21 = icmp eq i32 %.20, 0
+  %.23 = add i32 %.20, -1
+  %.24 = icmp eq i32 %.23, 0
+  store i8 0, ptr addrspace(3) %.11, align 1, !tbaa !15
+  br label %.37
+
+.32:                                               ; preds = %.114, %.48
+  %.33 = phi i32 [ %.50, %.48 ], [ %.115, %.114 ]
+  %.34 = icmp ult i32 %.44, %.23
+  %.35 = icmp ult i32 %.33, 60
+  %.36 = select i1 %.34, i1 %.35, i1 false
+  br i1 %.36, label %.37, label %.119
+
+.37:                                               ; preds = %.32, %.25
+  %.38 = phi i32 [ 0, %.5 ], [ %.44, %.32 ]
+  %.39 = phi i32 [ 0, %.5 ], [ %.33, %.32 ]
+  %.26 = shl i32 %.7, 10
+  %.40 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.38
+  %.41 = load i8, ptr addrspace(3) %.40, align 1, !tbaa !15
+  %.42 = shl i32 %.38, 5
+  %.43 = or i32 %.42, %.26
+  %.44 = add nuw i32 %.38, 1
+  %.45 = or i32 %.43, %.44
+  %.46 = add i32 %.38, 5
+  %.47 = icmp ult i32 %.46, %.20
+  br i1 %.47, label %.53, label %.48
+
+.48:                                               ; preds = %.98, %.37
+  %.49 = phi i32 [ %.45, %.37 ], [ %.100, %.98 ]
+  %.50 = phi i32 [ %.39, %.37 ], [ %.99, %.98 ]
+  %.51 = phi i32 [ %.44, %.37 ], [ %.54, %.98 ]
+  %.52 = icmp ult i32 %.51, %.20
+  br i1 %.52, label %.103, label %.32
+
+.53:                                               ; preds = %.37, %.98
+  %.54 = phi i32 [ %.101, %.98 ], [ %.46, %.37 ]
+  %.55 = phi i32 [ %.54, %.98 ], [ %.44, %.37 ]
+  %.56 = phi i32 [ %.99, %.98 ], [ %.39, %.37 ]
+  %.57 = phi i32 [ %.100, %.98 ], [ %.45, %.37 ]
+  %.58 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.55
+  %.59 = load i8, ptr addrspace(3) %.58, align 1, !tbaa !15
+  %.60 = icmp eq i8 %.41, %.59
+  br label %.98
+
+.98:                                               ; preds = %.93, %.87
+  %.99 = add i32 %.56, 1
+  %.100 = add i32 %.57, 4
+  %.101 = add i32 %.54, 4
+  %.102 = icmp ult i32 %.101, %.20
+  br i1 %.102, label %.53, label %.48
+
+.103:                                              ; preds = %.48, %.114
+  %.104 = phi i32 [ %.117, %.114 ], [ %.51, %.48 ]
+  %.105 = phi i32 [ %.115, %.114 ], [ %.50, %.48 ]
+  %.106 = phi i32 [ %.116, %.114 ], [ %.49, %.48 ]
+  %.107 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.104
+  %.108 = load i8, ptr addrspace(3) %.107, align 1, !tbaa !15
+  %.109 = icmp eq i8 %.41, %.108
+  br i1 %.109, label %.110, label %.114
+
+.110:                                              ; preds = %.103
+  %.111 = add i32 %.105, 1
+  %.112 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+  %.113 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %.112
+  store i32 %.106, ptr addrspace(3) %.113, align 4, !tbaa !11
+  br label %.114
+
+.114:                                              ; preds = %.110, %.103
+  %.115 = phi i32 [ %.111, %.110 ], [ %.105, %.103 ]
+  %.116 = add i32 %.106, 1
+  %.117 = add nuw i32 %.104, 1
+  %.118 = icmp ult i32 %.117, %.20
+  br i1 %.118, label %.103, label %.32
+
+.119:                                              ; preds = %.32, %.22, %.5
+  tail call void @_Z7barrierj(i32 noundef 1) #5
+  %.120 = load i32, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
+  %.121 = icmp ugt i32 %.120, %.9
+  br label %.206
+
+.206:                                              ; preds = %.201, %.119
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i64 @llvm.fshl.i64(i64, i64, i64) #3
+
+attributes #0 = { convergent mustprogress nofree nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
+attributes #1 = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
+attributes #2 = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="64,64" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" "uniform-work-group-size"="true" }
+attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #4 = { convergent nounwind willreturn memory(none) }
+attributes #5 = { convergent nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!opencl.ocl.version = !{!3}
+!llvm.ident = !{!4}
+
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 8, !"PIC Level", i32 2}
+!3 = !{i32 1, i32 2}
+!4 = !{!"clang version 17.0.0 (ssh://chfang@git.amd.com:29418/lightning/ec/llvm-project 06ead8cf696777b9f17876b60707ba9de4d0606f)"}
+!5 = !{i32 1, i32 1, i32 1, i32 1, i32 1}
+!6 = !{!"none", !"none", !"none", !"none", !"none"}
+!7 = !{!"char*", !"char*", !"uint*", !"uint*", !"uint*"}
+!8 = !{!"", !"", !"", !"", !""}
+!9 = !{!"ht_src", !"ht_dst", !"rowCountersSrc", !"rowCountersDst", !"debug"}
+!10 = !{i32 64, i32 1, i32 1}
+!11 = !{!12, !12, i64 0}
+!12 = !{!"int", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !{!13, !13, i64 0}
+!16 = !{!17, !17, i64 0}
+!17 = !{!"long", !13, i64 0}

>From c4cce6dbf4d95c7af15006256ad5f381ac883c23 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Tue, 26 Sep 2023 18:23:52 +0200
Subject: [PATCH 3/3] AMDGPU: Fix temporal divergence introduced by
 machine-sink

Temporal divergence that was present in the input or introduced by IR
transforms, like code sinking or LICM, is handled in SIFixSGPRCopies
by changing the SGPR-defining source instruction into a VGPR
instruction.
After 5b657f50b8e8dc5836fb80e566ca7569fd04c26f, which moved LICM after
AMDGPUCodeGenPrepare, machine sinking can also introduce temporal
divergence by sinking instructions out of loops.
Add a callback in TargetInstrInfo that lets targets fix temporal
divergence introduced by sinking instructions outside a cycle with a
divergent exit. The AMDGPU implementation uses machine uniformity
analysis to detect temporally divergent uses.
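
For illustration, here is a minimal IR sketch (not part of the patch;
the function and value names are made up) of the temporal divergence
being lowered. %i.next is uniform on every iteration of the loop, but
the exit condition depends on the workitem id, so lanes leave the loop
on different iterations and the value of %i.next observed at the use
outside the loop differs per lane:

define amdgpu_kernel void @temporal_divergence_sketch(ptr addrspace(1) %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  br label %loop

loop:                                             ; preds = %loop, %entry
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %i.next = add i32 %i, 1                         ; uniform inside the loop
  %exit.cond = icmp uge i32 %i.next, %tid         ; divergent exit condition
  br i1 %exit.cond, label %exit, label %loop

exit:                                             ; preds = %loop
  ; temporally divergent use: %i.next depends on when each lane exited
  store i32 %i.next, ptr addrspace(1) %out, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()

The new hook handles such cases after machine sinking by inserting a
COPY to a VGPR (with an implicit EXEC use) right after the uniform
definition inside the cycle and rewriting the use outside the cycle to
read that VGPR, so each lane keeps the value from its last active
iteration.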
---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 43 +++++++++++++
 llvm/include/llvm/ADT/GenericUniformityInfo.h |  6 ++
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   |  8 +++
 llvm/lib/Analysis/UniformityAnalysis.cpp      |  9 ++-
 llvm/lib/CodeGen/MachineSink.cpp              |  6 ++
 .../lib/CodeGen/MachineUniformityAnalysis.cpp |  9 ++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 60 +++++++++++++++++++
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  3 +
 ...ne-sink-temporal-divergence-swdev407790.ll |  6 +-
 9 files changed, 144 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index ddd0746ccd91632..755e1161b41b521 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -341,6 +341,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   using DivergenceDescriptorT =
       typename SyncDependenceAnalysisT::DivergenceDescriptor;
   using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap;
+  using UseOutsideCycleWithDivergentExitInfoT =
+      typename std::tuple<ConstValueRefT, const InstructionT *,
+                          SmallVector<BlockT *, 4>>;
 
   GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI,
                                 const TargetTransformInfo *TTI)
@@ -396,6 +399,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
 
   void print(raw_ostream &out) const;
 
+  iterator_range<const UseOutsideCycleWithDivergentExitInfoT *>
+  uses_outside_cycles_with_divergent_exit() const;
+
 protected:
   /// \brief Value/block pair representing a single phi input.
   struct PhiInput {
@@ -427,6 +433,8 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
 
   // Recognized cycles with divergent exits.
   SmallPtrSet<const CycleT *, 16> DivergentExitCycles;
+  SmallVector<UseOutsideCycleWithDivergentExitInfoT, 4>
+      UsesOutsideCyclesWithDivergentExit;
 
   // Cycles assumed to be divergent.
   //
@@ -470,6 +478,10 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   /// \brief Whether \p Def is divergent when read in \p ObservingBlock.
   bool isTemporalDivergent(const BlockT &ObservingBlock,
                            const InstructionT &Def) const;
+
+  void recordUseOutsideCycleWithDivergentExit(ConstValueRefT Src,
+                                              const InstructionT *UserInstr,
+                                              const CycleT &DefCycle);
 };
 
 template <typename ImplT>
@@ -1210,6 +1222,20 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
   }
 }
 
+template <typename ContextT>
+using UseOutsideCycleWithDivergentExitInfoT =
+    typename std::tuple<typename ContextT::ConstValueRefT,
+                        const typename ContextT::InstructionT *,
+                        SmallVector<typename ContextT::BlockT *, 4>>;
+
+template <typename ContextT>
+iterator_range<const UseOutsideCycleWithDivergentExitInfoT<ContextT> *>
+GenericUniformityAnalysisImpl<
+    ContextT>::uses_outside_cycles_with_divergent_exit() const {
+  return make_range(UsesOutsideCyclesWithDivergentExit.begin(),
+                    UsesOutsideCyclesWithDivergentExit.end());
+}
+
 template <typename ContextT>
 bool GenericUniformityInfo<ContextT>::hasDivergence() const {
   return DA->hasDivergence();
@@ -1248,6 +1274,13 @@ void GenericUniformityInfo<ContextT>::print(raw_ostream &out) const {
   DA->print(out);
 }
 
+template <typename ContextT>
+iterator_range<const UseOutsideCycleWithDivergentExitInfoT<ContextT> *>
+GenericUniformityInfo<ContextT>::uses_outside_cycles_with_divergent_exit()
+    const {
+  return DA->uses_outside_cycles_with_divergent_exit();
+}
+
 template <typename ContextT>
 void llvm::ModifiedPostOrder<ContextT>::computeStackPO(
     SmallVectorImpl<const BlockT *> &Stack, const CycleInfoT &CI,
@@ -1367,6 +1400,16 @@ void llvm::ModifiedPostOrder<ContextT>::compute(const CycleInfoT &CI) {
   computeStackPO(Stack, CI, nullptr, Finalized);
 }
 
+template <typename ContextT>
+void GenericUniformityAnalysisImpl<ContextT>::
+    recordUseOutsideCycleWithDivergentExit(ConstValueRefT Src,
+                                           const InstructionT *UserInstr,
+                                           const CycleT &DefCycle) {
+  SmallVector<BlockT *, 4> TmpExitBlocks;
+  DefCycle.getExitBlocks(TmpExitBlocks);
+  UsesOutsideCyclesWithDivergentExit.push_back({Src, UserInstr, TmpExitBlocks});
+}
+
 } // namespace llvm
 
 #undef DEBUG_TYPE
diff --git a/llvm/include/llvm/ADT/GenericUniformityInfo.h b/llvm/include/llvm/ADT/GenericUniformityInfo.h
index e53afccc020b469..a76813d4bb964a1 100644
--- a/llvm/include/llvm/ADT/GenericUniformityInfo.h
+++ b/llvm/include/llvm/ADT/GenericUniformityInfo.h
@@ -39,6 +39,9 @@ template <typename ContextT> class GenericUniformityInfo {
 
   using CycleInfoT = GenericCycleInfo<ContextT>;
   using CycleT = typename CycleInfoT::CycleT;
+  using UseOutsideCycleWithDivergentExitInfoT =
+      typename std::tuple<ConstValueRefT, const InstructionT *,
+                          SmallVector<BlockT *, 4>>;
 
   GenericUniformityInfo(const DominatorTreeT &DT, const CycleInfoT &CI,
                         const TargetTransformInfo *TTI = nullptr);
@@ -78,6 +81,9 @@ template <typename ContextT> class GenericUniformityInfo {
 
   void print(raw_ostream &Out) const;
 
+  iterator_range<const UseOutsideCycleWithDivergentExitInfoT *>
+  uses_outside_cycles_with_divergent_exit() const;
+
 private:
   using ImplT = GenericUniformityAnalysisImpl<ContextT>;
 
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 98679b4dcf3cbfb..2135484448ef4ad 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -19,6 +19,8 @@
 #include "llvm/ADT/Uniformity.h"
 #include "llvm/CodeGen/MIRFormatter.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -150,6 +152,12 @@ class TargetInstrInfo : public MCInstrInfo {
     return false;
   }
 
+  virtual void fixTemporalDivergence(MachineFunction &MF,
+                                     MachineDominatorTree *DT,
+                                     MachineCycleInfo *CI) const {
+    return;
+  }
+
 protected:
   /// For instructions with opcodes for which the M_REMATERIALIZABLE flag is
   /// set, this hook lets the target specify whether the instruction is actually
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 2d617db431c5888..df1299610469d30 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -80,12 +80,17 @@ template <>
 void llvm::GenericUniformityAnalysisImpl<
     SSAContext>::propagateTemporalDivergence(const Instruction &I,
                                              const Cycle &DefCycle) {
-  if (isDivergent(I))
-    return;
   for (auto *User : I.users()) {
     auto *UserInstr = cast<Instruction>(User);
     if (DefCycle.contains(UserInstr->getParent()))
       continue;
+
+    recordUseOutsideCycleWithDivergentExit(cast<Value>(&I), UserInstr,
+                                           DefCycle);
+
+    if (isDivergent(I))
+      continue;
+
     markDivergent(*UserInstr);
   }
 }
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 15a6c00bce892c6..ef35ec57a56971a 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -777,6 +777,12 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
     MRI->clearKillFlags(I);
   RegsToClearKillFlags.clear();
 
+  // Sinking may have created new basic blocks, so recompute dominators and cycle info.
+  DT->calculate(MF);
+  CI->clear();
+  CI->compute(MF);
+  TII->fixTemporalDivergence(MF, DT, CI);
+
   return EverMadeChange;
 }
 
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 3e0fe2b1ba087fe..5bd87d069285857 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -117,11 +117,16 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::
     if (!Op.getReg().isVirtual())
       continue;
     auto Reg = Op.getReg();
-    if (isDivergent(Reg))
-      continue;
+
     for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
       if (DefCycle.contains(UserInstr.getParent()))
         continue;
+
+      recordUseOutsideCycleWithDivergentExit(Reg, &UserInstr, DefCycle);
+
+      if (isDivergent(Reg))
+        continue;
+
       markDivergent(UserInstr);
     }
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2799a3e78b04d22..5db140bb0fc6c94 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/MachineUniformityAnalysis.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/IR/DiagnosticInfo.h"
@@ -171,6 +172,65 @@ bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
          isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
 }
 
+// Get the block and insertion point for building an instruction after MI (skips PHIs and labels).
+static std::pair<MachineBasicBlock *, MachineBasicBlock::iterator>
+getInsertAfterPtrs(MachineInstr *MI) {
+  MachineBasicBlock *InsertMBB = MI->getParent();
+  return std::make_pair(
+      InsertMBB, InsertMBB->SkipPHIsAndLabels(std::next(MI->getIterator())));
+}
+
+static void replaceUseRegisterWith(const MachineInstr *MI, Register Reg,
+                                   Register Newreg) {
+  for (unsigned i = 0; i < MI->getNumOperands(); ++i) {
+    const MachineOperand &Op = MI->getOperand(i);
+    if (Op.isReg() && Op.getReg() == Reg) {
+      const_cast<MachineInstr *>(MI)->getOperand(i).setReg(Newreg);
+    }
+  }
+}
+
+void SIInstrInfo::fixTemporalDivergence(MachineFunction &MF,
+                                        MachineDominatorTree *DT,
+                                        MachineCycleInfo *CI) const {
+  MachineUniformityInfo MUI =
+      computeMachineUniformityInfo(MF, *CI, DT->getBase(), true);
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
+
+  // Temporal divergence lowering is only needed for a uniform SrcReg with a
+  // divergent UserInstr; the user is only uniform when the cycle exit is uniform.
+  for (auto [SrcReg, UserInstr, CycleExitBlocks] :
+       MUI.uses_outside_cycles_with_divergent_exit()) {
+    if (!MUI.isUniform(SrcReg) || !MUI.isDivergent(UserInstr))
+      continue;
+
+    MachineInstr *UniformSourceInstr = MRI.getVRegDef(SrcReg);
+
+    // FIXME: SrcReg is a lane mask in this case. Find a better way to detect it.
+    if (UniformSourceInstr->getOpcode() == AMDGPU::SI_IF_BREAK ||
+        UserInstr->getOpcode() == AMDGPU::SI_IF)
+      continue;
+
+    unsigned Size = TRI.getRegSizeInBits(*MRI.getRegClassOrNull(SrcReg));
+    Register VgprDst =
+        MRI.createVirtualRegister(TRI.getVGPRClassForBitWidth(Size));
+
+    auto [MBB, AfterUniformSourceReg] = getInsertAfterPtrs(UniformSourceInstr);
+    BuildMI(*MBB, AfterUniformSourceReg, {}, TII.get(AMDGPU::COPY))
+        .addDef(VgprDst)
+        .addReg(SrcReg)
+        .addReg(AMDGPU::EXEC, RegState::Implicit);
+
+    replaceUseRegisterWith(UserInstr, SrcReg, VgprDst);
+  }
+
+  return;
+}
+
 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                           int64_t &Offset0,
                                           int64_t &Offset1) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e85917a4c0f3296..4446a962d7cca44 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -222,6 +222,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 
   bool isIgnorableUse(const MachineOperand &MO) const override;
 
+  void fixTemporalDivergence(MachineFunction &MF, MachineDominatorTree *DT,
+                             MachineCycleInfo *CI) const override;
+
   bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0,
                                int64_t &Offset1) const override;
 
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index ca1cf526d949a14..0c631bdcdd374a8 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -167,6 +167,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s59
 ; CHECK-NEXT:    s_add_i32 s58, s58, 4
 ; CHECK-NEXT:    s_add_i32 s4, s55, s58
+; CHECK-NEXT:    v_mov_b32_e32 v0, s58
 ; CHECK-NEXT:    s_add_i32 s5, s4, 5
 ; CHECK-NEXT:    s_add_i32 s4, s4, 1
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s5, v42
@@ -267,7 +268,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:  .LBB0_16: ; %Flow43
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
-; CHECK-NEXT:    v_add_nc_u32_e32 v57, s58, v57
+; CHECK-NEXT:    v_add_nc_u32_e32 v57, v57, v0
 ; CHECK-NEXT:  .LBB0_17: ; %Flow44
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s56
@@ -869,6 +870,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:    s_add_i32 s7, s7, 4
 ; CHECK-NEXT:    v_add_nc_u32_e32 v43, 1, v43
 ; CHECK-NEXT:    s_add_i32 s8, s4, s7
+; CHECK-NEXT:    v_mov_b32_e32 v0, s7
 ; CHECK-NEXT:    s_add_i32 s9, s8, 5
 ; CHECK-NEXT:    s_add_i32 s8, s8, 1
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s9, v41
@@ -879,7 +881,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
 ; CHECK-NEXT:  ; %bb.4: ; %Flow3
 ; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
-; CHECK-NEXT:    v_add_nc_u32_e32 v47, s7, v47
+; CHECK-NEXT:    v_add_nc_u32_e32 v47, v47, v0
 ; CHECK-NEXT:  .LBB1_5: ; %Flow4
 ; CHECK-NEXT:    ; in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5


