[llvm] AMDGPU: Fix temporal divergence introduced by machine-sink and performance regression introduced by D155343 (PR #67456)
via llvm-commits
llvm-commits@lists.llvm.org
Tue Oct 3 10:08:01 PDT 2023
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/67456
From cb0e1fe3206b2acb306c4eb07ea476a6c5960869 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic@amd.com>
Date: Thu, 21 Sep 2023 13:02:43 +0200
Subject: [PATCH 1/3] Revert "MachineSink: Fix sinking VGPR def out of a
divergent loop"
This reverts commit 3f8ef57bede94445b1a1042c987cc914a886e7ff.
---
llvm/lib/CodeGen/MachineSink.cpp | 15 ++++-----------
...-loop-var-out-of-divergent-loop-swdev407790.ll | 2 +-
...loop-var-out-of-divergent-loop-swdev407790.mir | 2 +-
.../CodeGen/AMDGPU/sink-after-control-flow.mir | 2 +-
4 files changed, 7 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 9d4e0c647048f53..02c7880f86f00a1 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -300,7 +300,8 @@ static bool blockPrologueInterferes(const MachineBasicBlock *BB,
if (!Reg)
continue;
if (MO.isUse()) {
- if (Reg.isPhysical() && MRI && MRI->isConstantPhysReg(Reg))
+ if (Reg.isPhysical() &&
+ (TII->isIgnorableUse(MO) || (MRI && MRI->isConstantPhysReg(Reg))))
continue;
if (PI->modifiesRegister(Reg, TRI))
return true;
@@ -1247,24 +1248,16 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
if (MBB == SuccToSinkTo)
return nullptr;
- if (!SuccToSinkTo)
- return nullptr;
-
// It's not safe to sink instructions to EH landing pad. Control flow into
// landing pad is implicitly defined.
- if (SuccToSinkTo->isEHPad())
+ if (SuccToSinkTo && SuccToSinkTo->isEHPad())
return nullptr;
// It ought to be okay to sink instructions into an INLINEASM_BR target, but
// only if we make sure that MI occurs _before_ an INLINEASM_BR instruction in
// the source block (which this code does not yet do). So for now, forbid
// doing so.
- if (SuccToSinkTo->isInlineAsmBrIndirectTarget())
- return nullptr;
-
- MachineBasicBlock::const_iterator InsertPos =
- SuccToSinkTo->SkipPHIsAndLabels(SuccToSinkTo->begin());
- if (blockPrologueInterferes(SuccToSinkTo, InsertPos, MI, TRI, TII, MRI))
+ if (SuccToSinkTo && SuccToSinkTo->isInlineAsmBrIndirectTarget())
return nullptr;
return SuccToSinkTo;
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
index e2456b74f7ef1fa..b8e74bc7db09a1a 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
@@ -21,6 +21,7 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
; CHECK-NEXT: .LBB0_1: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: v_add_nc_u32_e32 v4, -4, v4
; CHECK-NEXT: .LBB0_2: ; %Flow1
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
@@ -53,7 +54,6 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_add_nc_u32_e32 v4, s9, v2
; CHECK-NEXT: v_cmp_ge_u32_e64 s4, v4, v0
-; CHECK-NEXT: v_add_nc_u32_e32 v4, -4, v4
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execz .LBB0_1
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
index cc14b4a80d58a7d..037a285794120da 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
@@ -42,7 +42,6 @@ body: |
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[SI_IF1]], [[SI_IF]], implicit-def dead $scc
; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.5
@@ -52,6 +51,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.4
; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]]
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir
index ee3d7aeb454f96b..4feef2149b42249 100644
--- a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir
+++ b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir
@@ -17,7 +17,6 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 8
- ; GFX10-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[S_MOV_B32_]], [[DEF]], implicit $exec
; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[DEF]], 8, 5, implicit $exec
; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5
; GFX10-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_BFE_U32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
@@ -38,6 +37,7 @@ body: |
; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc
+ ; GFX10-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[S_MOV_B32_]], [[DEF]], implicit $exec
; GFX10-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 31
; GFX10-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_BFE_U32_e64_]], killed [[S_MOV_B32_2]], implicit $exec
; GFX10-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_1]], -1, implicit-def $scc
From 45833ace0bf89fbee5ca9682d786b3fe7a490960 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic@amd.com>
Date: Thu, 21 Sep 2023 14:20:49 +0200
Subject: [PATCH 2/3] AMDGPU: Add test for temporal divergence introduced by
machine-sink
Introduced by 5b657f50b8e8dc5836fb80e566ca7569fd04c26f, which moved
LICM after AMDGPUCodeGenPrepare. Some instructions are no longer
sunk during IR optimizations but by machine-sinking instead.
If a VGPR instruction that uses an SGPR defined inside the cycle is
sunk outside of the cycle, we end up with an unhandled case of
temporal divergence.
Add a test for the theoretical case where a SALU instruction (which
represents a uniform value) is sunk outside of the cycle.
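For reference, a minimal IR sketch of the temporal-divergence pattern
(illustrative only, not taken from the tests below; the function and
value names are invented here, and %tid stands in for a divergent
per-lane value such as a workitem id). %i.next is computed uniformly
within each iteration, but lanes leave the loop on different
iterations, so a use sunk below the loop reads a different value in
each lane:

  define void @temporal_divergence_sketch(i32 %tid) {
  entry:
    br label %loop

  loop:
    %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
    %i.next = add i32 %i, 1            ; uniform within one iteration
    %cont = icmp ult i32 %i.next, %tid ; divergent exit condition
    br i1 %cont, label %loop, label %exit

  exit:
    ; If a VGPR user of %i.next is sunk here, each lane must observe
    ; the value from its own last active iteration.
    call void asm sideeffect "; use $0", "v"(i32 %i.next)
    ret void
  }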
---
.../CodeGen/AMDGPU/machine-sink-lane-mask.mir | 143 ++
...ne-sink-temporal-divergence-swdev407790.ll | 1092 ++++++++++++++
...e-sink-temporal-divergence-swdev407790.mir | 1319 +++++++++++++++++
3 files changed, 2554 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
new file mode 100644
index 000000000000000..04c80582f6f0797
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
@@ -0,0 +1,143 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -o - %s | FileCheck %s
+
+---
+name: multi_else_break
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: multi_else_break
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %9, %bb.6
+ ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, %11, %bb.6
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]], %bb.1, %13, %bb.5
+ ; CHECK-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, %15, %bb.5
+ ; CHECK-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %17, %bb.5
+ ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, %19, %bb.5
+ ; CHECK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[PHI5]], [[COPY1]], implicit $exec
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
+ ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def $scc
+ ; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI2]], $exec_lo, implicit-def $scc
+ ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_I32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: SI_END_CF %9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4:
+ ; CHECK-NEXT: successors: %bb.5(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI5]], [[S_MOV_B32_1]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY]], [[V_ADD_U32_e64_]], implicit $exec
+ ; CHECK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_]], $exec_lo, implicit-def $scc
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_]]
+ ; CHECK-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[S_OR_B32_1]], $exec_lo, implicit-def $scc
+ ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def $scc
+ ; CHECK-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_1]], [[S_AND_B32_]], implicit-def $scc
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.5:
+ ; CHECK-NEXT: successors: %bb.6(0x04000000), %bb.2(0x7c000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]], %bb.2, [[S_OR_B32_2]], %bb.4
+ ; CHECK-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]], %bb.2, [[COPY4]], %bb.4
+ ; CHECK-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.2, [[V_ADD_U32_e64_]], %bb.4
+ ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI6]], [[PHI4]], implicit-def dead $scc
+ ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.6
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.6:
+ ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.1(0x7c000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI8]], %bb.5
+ ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI7]], [[PHI]], implicit-def dead $scc
+ ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.3
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr4, $vgpr5
+
+ %21:vgpr_32 = COPY $vgpr5
+ %20:vgpr_32 = COPY $vgpr4
+ %23:sreg_32 = S_MOV_B32 0
+ %33:vgpr_32 = COPY %23, implicit $exec
+ %38:sreg_32 = IMPLICIT_DEF
+ %44:sreg_32 = IMPLICIT_DEF
+ %26:sreg_32 = IMPLICIT_DEF
+ %29:sreg_32 = S_MOV_B32 1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ %0:sreg_32 = PHI %23, %bb.0, %12, %bb.6
+ %1:vgpr_32 = PHI %33, %bb.0, %13, %bb.6
+
+ bb.2:
+ successors: %bb.4(0x40000000), %bb.5(0x40000000)
+
+ %48:sreg_32 = PHI %44, %bb.1, %10, %bb.5
+ %42:sreg_32 = PHI %38, %bb.1, %8, %bb.5
+ %2:sreg_32 = PHI %23, %bb.1, %11, %bb.5
+ %3:vgpr_32 = PHI %1, %bb.1, %9, %bb.5
+ %27:sreg_32 = V_CMP_LT_I32_e64 %3, %20, implicit $exec
+ %36:vgpr_32 = COPY %26
+ %39:sreg_32 = S_OR_B32 %42, $exec_lo, implicit-def $scc
+ %45:sreg_32 = S_OR_B32 %48, $exec_lo, implicit-def $scc
+ %4:sreg_32 = SI_IF killed %27, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.3:
+ SI_END_CF %12, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_ENDPGM 0
+
+ bb.4:
+ successors: %bb.5(0x80000000)
+
+ %6:vgpr_32 = V_ADD_U32_e64 %3, %29, 0, implicit $exec
+ %30:sreg_32 = V_CMP_NE_U32_e64 %21, %6, implicit $exec
+ %43:sreg_32 = S_ANDN2_B32 %39, $exec_lo, implicit-def $scc
+ %40:sreg_32 = COPY %43
+ %49:sreg_32 = S_ANDN2_B32 %45, $exec_lo, implicit-def $scc
+ %50:sreg_32 = S_AND_B32 %30, $exec_lo, implicit-def $scc
+ %46:sreg_32 = S_OR_B32 %49, %50, implicit-def $scc
+
+ bb.5:
+ successors: %bb.6(0x04000000), %bb.2(0x7c000000)
+
+ %10:sreg_32 = PHI %45, %bb.2, %46, %bb.4
+ %8:sreg_32 = PHI %39, %bb.2, %40, %bb.4
+ %9:vgpr_32 = PHI %36, %bb.2, %6, %bb.4
+ SI_END_CF %4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ %11:sreg_32 = SI_IF_BREAK %10, %2, implicit-def dead $scc
+ %12:sreg_32 = SI_IF_BREAK %8, %0, implicit-def dead $scc
+ SI_LOOP %11, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.6
+
+ bb.6:
+ successors: %bb.3(0x04000000), %bb.1(0x7c000000)
+
+ %13:vgpr_32 = PHI %9, %bb.5
+ SI_END_CF %11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ SI_LOOP %12, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.3
+...
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
new file mode 100644
index 000000000000000..ca1cf526d949a14
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -0,0 +1,1092 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s
+
+; ModuleID = 'kernel_round1_passing.bc'
+source_filename = "/tmp/comgr-295d04/input/CompileSource"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
+target triple = "amdgcn-amd-amdhsa"
+
+@kernel_round1.first_words_data = external hidden unnamed_addr addrspace(3) global [896 x i8], align 1
+@kernel_round1.collisionsData = external hidden unnamed_addr addrspace(3) global [3840 x i32], align 4
+@kernel_round1.collisionsNum = external hidden addrspace(3) global i32, align 4
+
+; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
+declare hidden i64 @_Z13get_global_idj(i32 noundef) local_unnamed_addr #0
+
+; Function Attrs: convergent nounwind
+declare hidden i32 @_Z10atomic_addPU3AS1Vjj(ptr addrspace(1) noundef, i32 noundef) local_unnamed_addr #1
+
+; Function Attrs: convergent nounwind
+declare hidden i32 @_Z10atomic_subPU3AS1Vjj(ptr addrspace(1) noundef, i32 noundef) local_unnamed_addr #1
+
+; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
+declare hidden i64 @_Z12get_local_idj(i32 noundef) local_unnamed_addr #0
+
+; Function Attrs: convergent nounwind
+declare hidden void @_Z7barrierj(i32 noundef) local_unnamed_addr #1
+
+; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
+declare hidden i32 @_Z3minjj(i32 noundef, i32 noundef) local_unnamed_addr #0
+
+; Function Attrs: convergent nounwind
+declare hidden i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef) local_unnamed_addr #1
+
+; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
+declare hidden i64 @_Z14get_local_sizej(i32 noundef) local_unnamed_addr #0
+
+; Function Attrs: convergent norecurse nounwind
+define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture noundef readonly align 1 %0, ptr addrspace(1) nocapture noundef writeonly align 1 %1, ptr addrspace(1) nocapture noundef readonly align 4 %2, ptr addrspace(1) noundef align 4 %3, ptr addrspace(1) nocapture noundef readnone align 4 %4) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 !reqd_work_group_size !10 {
+; CHECK-LABEL: kernel_round1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_add_u32 s10, s10, s15
+; CHECK-NEXT: s_mov_b32 s32, 0
+; CHECK-NEXT: s_addc_u32 s11, s11, 0
+; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT: s_load_dwordx8 s[44:51], s[6:7], 0x0
+; CHECK-NEXT: s_add_u32 s0, s0, s15
+; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7]
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: v_mov_b32_e32 v41, v0
+; CHECK-NEXT: s_add_u32 s42, s34, 40
+; CHECK-NEXT: v_mov_b32_e32 v31, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
+; CHECK-NEXT: s_addc_u32 s43, s35, 0
+; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT: s_mov_b32 s33, s14
+; CHECK-NEXT: s_mov_b32 s40, s13
+; CHECK-NEXT: s_mov_b32 s41, s12
+; CHECK-NEXT: s_mov_b64 s[38:39], s[4:5]
+; CHECK-NEXT: s_getpc_b64 s[6:7]
+; CHECK-NEXT: s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+12
+; CHECK-NEXT: v_mov_b32_e32 v45, 0
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT: v_mov_b32_e32 v43, v0
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT: s_mov_b32 s12, s41
+; CHECK-NEXT: s_mov_b32 s13, s40
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: s_getpc_b64 s[6:7]
+; CHECK-NEXT: s_add_u32 s6, s6, _Z12get_local_idj@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s7, s7, _Z12get_local_idj@rel32@hi+12
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT: v_mov_b32_e32 v40, v0
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
+; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT: s_mov_b32 s12, s41
+; CHECK-NEXT: s_mov_b32 s13, s40
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: ds_write_b32 v45, v45 offset:15360
+; CHECK-NEXT: s_getpc_b64 s[52:53]
+; CHECK-NEXT: s_add_u32 s52, s52, _Z7barrierj@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s53, s53, _Z7barrierj@rel32@hi+12
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[52:53]
+; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v43
+; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v43
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0
+; CHECK-NEXT: v_and_b32_e32 v1, 28, v1
+; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT: s_mov_b32 s12, s41
+; CHECK-NEXT: s_mov_b32 s13, s40
+; CHECK-NEXT: global_load_dword v0, v0, s[48:49]
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: s_getpc_b64 s[6:7]
+; CHECK-NEXT: s_add_u32 s6, s6, _Z3minjj@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s7, s7, _Z3minjj@rel32@hi+12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4
+; CHECK-NEXT: v_mov_b32_e32 v1, 12
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT: v_mov_b32_e32 v42, v0
+; CHECK-NEXT: s_mov_b32 s48, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u32_e32 0, v42
+; CHECK-NEXT: s_cbranch_execz .LBB0_25
+; CHECK-NEXT: ; %bb.1: ; %.preheader5
+; CHECK-NEXT: v_mul_lo_u32 v0, v40, 14
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v0
+; CHECK-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_add_nc_u32_e32 v1, s5, v44
+; CHECK-NEXT: s_add_i32 s5, s5, 1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v42
+; CHECK-NEXT: ds_write_b8 v1, v45
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT: s_cbranch_execnz .LBB0_2
+; CHECK-NEXT: ; %bb.3:
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42
+; CHECK-NEXT: s_mov_b32 s49, 0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45
+; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB0_25
+; CHECK-NEXT: ; %bb.4:
+; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43
+; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0
+; CHECK-NEXT: v_mov_b32_e32 v47, 0
+; CHECK-NEXT: s_getpc_b64 s[42:43]
+; CHECK-NEXT: s_add_u32 s42, s42, _Z10atomic_incPU3AS3Vj@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s43, s43, _Z10atomic_incPU3AS3Vj@rel32@hi+12
+; CHECK-NEXT: s_mov_b32 s55, 0
+; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1
+; CHECK-NEXT: ; Child Loop BB0_8 Depth 2
+; CHECK-NEXT: ; Child Loop BB0_20 Depth 2
+; CHECK-NEXT: v_add_nc_u32_e32 v0, s55, v44
+; CHECK-NEXT: s_lshl_b32 s4, s55, 5
+; CHECK-NEXT: s_add_i32 s54, s55, 1
+; CHECK-NEXT: s_add_i32 s5, s55, 5
+; CHECK-NEXT: v_or3_b32 v57, s4, v43, s54
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v56, v0
+; CHECK-NEXT: v_mov_b32_e32 v59, s54
+; CHECK-NEXT: s_mov_b32 s56, exec_lo
+; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42
+; CHECK-NEXT: s_cbranch_execz .LBB0_17
+; CHECK-NEXT: ; %bb.6: ; %.preheader2
+; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v58, 0xff, v56
+; CHECK-NEXT: s_mov_b32 s57, 0
+; CHECK-NEXT: s_mov_b32 s58, 0
+; CHECK-NEXT: s_branch .LBB0_8
+; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59
+; CHECK-NEXT: s_add_i32 s58, s58, 4
+; CHECK-NEXT: s_add_i32 s4, s55, s58
+; CHECK-NEXT: s_add_i32 s5, s4, 5
+; CHECK-NEXT: s_add_i32 s4, s4, 1
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42
+; CHECK-NEXT: v_mov_b32_e32 v59, s4
+; CHECK-NEXT: s_or_b32 s57, vcc_lo, s57
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s57
+; CHECK-NEXT: s_cbranch_execz .LBB0_16
+; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1
+; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
+; CHECK-NEXT: v_add_nc_u32_e32 v60, s58, v46
+; CHECK-NEXT: v_add_nc_u32_e32 v59, s58, v57
+; CHECK-NEXT: s_mov_b32 s59, exec_lo
+; CHECK-NEXT: ds_read_u8 v0, v60
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_cmpx_eq_u16_e64 v58, v0
+; CHECK-NEXT: s_cbranch_execz .LBB0_10
+; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT: s_add_u32 s8, s34, 40
+; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT: s_mov_b32 s12, s41
+; CHECK-NEXT: s_mov_b32 s13, s40
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: ds_write_b32 v0, v59
+; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59
+; CHECK-NEXT: ds_read_u8 v0, v60 offset:1
+; CHECK-NEXT: s_mov_b32 s59, exec_lo
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_cmpx_eq_u16_e64 v58, v0
+; CHECK-NEXT: s_cbranch_execz .LBB0_12
+; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT: s_add_u32 s8, s34, 40
+; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT: s_mov_b32 s12, s41
+; CHECK-NEXT: s_mov_b32 s13, s40
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: v_add_nc_u32_e32 v61, 1, v59
+; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: ds_write_b32 v0, v61
+; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59
+; CHECK-NEXT: ds_read_u8 v0, v60 offset:2
+; CHECK-NEXT: s_mov_b32 s59, exec_lo
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_cmpx_eq_u16_e64 v58, v0
+; CHECK-NEXT: s_cbranch_execz .LBB0_14
+; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT: s_add_u32 s8, s34, 40
+; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT: s_mov_b32 s12, s41
+; CHECK-NEXT: s_mov_b32 s13, s40
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: v_add_nc_u32_e32 v61, 2, v59
+; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: ds_write_b32 v0, v61
+; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59
+; CHECK-NEXT: ds_read_u8 v0, v60 offset:3
+; CHECK-NEXT: s_mov_b32 s59, exec_lo
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_cmpx_eq_u16_e64 v58, v0
+; CHECK-NEXT: s_cbranch_execz .LBB0_7
+; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT: s_add_u32 s8, s34, 40
+; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT: s_mov_b32 s12, s41
+; CHECK-NEXT: s_mov_b32 s13, s40
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: v_add_nc_u32_e32 v59, 3, v59
+; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: ds_write_b32 v0, v59
+; CHECK-NEXT: s_branch .LBB0_7
+; CHECK-NEXT: .LBB0_16: ; %Flow43
+; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
+; CHECK-NEXT: v_add_nc_u32_e32 v57, s58, v57
+; CHECK-NEXT: .LBB0_17: ; %Flow44
+; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56
+; CHECK-NEXT: s_mov_b32 s55, exec_lo
+; CHECK-NEXT: v_cmpx_lt_u32_e64 v59, v42
+; CHECK-NEXT: s_cbranch_execz .LBB0_23
+; CHECK-NEXT: ; %bb.18: ; %.preheader
+; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT: s_mov_b32 s56, 0
+; CHECK-NEXT: s_inst_prefetch 0x1
+; CHECK-NEXT: s_branch .LBB0_20
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB0_19: ; in Loop: Header=BB0_20 Depth=2
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
+; CHECK-NEXT: v_add_nc_u32_e32 v59, 1, v59
+; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v59, v42
+; CHECK-NEXT: s_or_b32 s56, vcc_lo, s56
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s56
+; CHECK-NEXT: s_cbranch_execz .LBB0_22
+; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1
+; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v59
+; CHECK-NEXT: ds_read_u8 v0, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NEXT: s_and_saveexec_b32 s57, s4
+; CHECK-NEXT: s_cbranch_execz .LBB0_19
+; CHECK-NEXT: ; %bb.21: ; in Loop: Header=BB0_20 Depth=2
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT: s_add_u32 s8, s34, 40
+; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT: s_mov_b32 s12, s41
+; CHECK-NEXT: s_mov_b32 s13, s40
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: ds_write_b32 v0, v57
+; CHECK-NEXT: s_branch .LBB0_19
+; CHECK-NEXT: .LBB0_22: ; %Flow41
+; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT: s_inst_prefetch 0x2
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56
+; CHECK-NEXT: .LBB0_23: ; %Flow42
+; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
+; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s54, v45
+; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47
+; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46
+; CHECK-NEXT: s_mov_b32 s55, s54
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT: s_and_b32 s4, exec_lo, s4
+; CHECK-NEXT: s_or_b32 s49, s4, s49
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s49
+; CHECK-NEXT: s_cbranch_execnz .LBB0_5
+; CHECK-NEXT: .LBB0_25: ; %Flow49
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
+; CHECK-NEXT: s_add_u32 s8, s34, 40
+; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT: s_mov_b32 s12, s41
+; CHECK-NEXT: s_mov_b32 s13, s40
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[52:53]
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_mov_b32 s4, exec_lo
+; CHECK-NEXT: ds_read_b32 v47, v0 offset:15360
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_cmpx_gt_u32_e64 v47, v40
+; CHECK-NEXT: s_cbranch_execz .LBB0_33
+; CHECK-NEXT: ; %bb.26:
+; CHECK-NEXT: s_add_u32 s52, s44, 8
+; CHECK-NEXT: s_addc_u32 s53, s45, 0
+; CHECK-NEXT: s_getpc_b64 s[42:43]
+; CHECK-NEXT: s_add_u32 s42, s42, _Z10atomic_addPU3AS1Vjj@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s43, s43, _Z10atomic_addPU3AS1Vjj@rel32@hi+12
+; CHECK-NEXT: s_mov_b32 s54, 0
+; CHECK-NEXT: s_getpc_b64 s[44:45]
+; CHECK-NEXT: s_add_u32 s44, s44, _Z10atomic_subPU3AS1Vjj@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s45, s45, _Z10atomic_subPU3AS1Vjj@rel32@hi+12
+; CHECK-NEXT: s_getpc_b64 s[48:49]
+; CHECK-NEXT: s_add_u32 s48, s48, _Z14get_local_sizej@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s49, s49, _Z14get_local_sizej@rel32@hi+12
+; CHECK-NEXT: s_branch .LBB0_28
+; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_add_u32 s8, s34, 40
+; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT: s_mov_b32 s12, s41
+; CHECK-NEXT: s_mov_b32 s13, s40
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[48:49]
+; CHECK-NEXT: v_add_co_u32 v40, vcc_lo, v0, v40
+; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v40
+; CHECK-NEXT: s_or_b32 s54, vcc_lo, s54
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s54
+; CHECK-NEXT: s_cbranch_execz .LBB0_33
+; CHECK-NEXT: .LBB0_28: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v40
+; CHECK-NEXT: s_mov_b32 s55, exec_lo
+; CHECK-NEXT: ds_read_b32 v0, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_lshrrev_b32_e32 v63, 10, v0
+; CHECK-NEXT: v_bfe_u32 v62, v0, 5, 5
+; CHECK-NEXT: v_and_b32_e32 v72, 31, v0
+; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x180, v63
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v62
+; CHECK-NEXT: v_lshlrev_b32_e32 v4, 5, v72
+; CHECK-NEXT: v_add_co_u32 v2, s4, s52, v1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s53, 0, s4
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
+; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
+; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_xor_b32_e32 v46, v9, v5
+; CHECK-NEXT: v_xor_b32_e32 v45, v8, v4
+; CHECK-NEXT: v_xor_b32_e32 v57, v11, v7
+; CHECK-NEXT: v_xor_b32_e32 v56, v10, v6
+; CHECK-NEXT: v_or_b32_e32 v5, v46, v57
+; CHECK-NEXT: v_or_b32_e32 v4, v45, v56
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB0_27
+; CHECK-NEXT: ; %bb.29: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:16
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v45
+; CHECK-NEXT: v_alignbit_b32 v1, v46, v45, 12
+; CHECK-NEXT: v_and_b32_e32 v2, 0xf0000, v45
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: s_add_u32 s8, s34, 40
+; CHECK-NEXT: v_and_b32_e32 v3, 0xf000, v0
+; CHECK-NEXT: v_and_b32_e32 v4, 0xf00, v1
+; CHECK-NEXT: v_and_b32_e32 v0, 0xf0, v0
+; CHECK-NEXT: v_and_b32_e32 v1, 15, v1
+; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT: v_or3_b32 v2, v3, v2, v4
+; CHECK-NEXT: s_mov_b32 s12, s41
+; CHECK-NEXT: s_mov_b32 s13, s40
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: v_or3_b32 v73, v2, v0, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v73
+; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v73
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffc, v0
+; CHECK-NEXT: v_lshlrev_b32_e64 v44, v1, 1
+; CHECK-NEXT: v_and_b32_e32 v74, 28, v1
+; CHECK-NEXT: v_add_co_u32 v42, s4, s50, v0
+; CHECK-NEXT: v_add_co_ci_u32_e64 v43, null, s51, 0, s4
+; CHECK-NEXT: v_mov_b32_e32 v2, v44
+; CHECK-NEXT: v_mov_b32_e32 v0, v42
+; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT: v_mov_b32_e32 v1, v43
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT: v_bfe_u32 v0, v0, v74, 4
+; CHECK-NEXT: s_mov_b32 s4, exec_lo
+; CHECK-NEXT: v_cmpx_gt_u32_e32 12, v0
+; CHECK-NEXT: s_xor_b32 s4, exec_lo, s4
+; CHECK-NEXT: s_cbranch_execz .LBB0_31
+; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT: v_xor_b32_e32 v5, v60, v58
+; CHECK-NEXT: v_lshrrev_b64 v[3:4], 16, v[56:57]
+; CHECK-NEXT: v_mul_u32_u24_e32 v11, 0x180, v73
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; CHECK-NEXT: v_lshrrev_b64 v[1:2], 16, v[45:46]
+; CHECK-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; CHECK-NEXT: v_lshlrev_b32_e32 v8, 6, v72
+; CHECK-NEXT: v_lshlrev_b32_e32 v10, 12, v63
+; CHECK-NEXT: v_xor_b32_e32 v6, v61, v59
+; CHECK-NEXT: v_lshlrev_b32_e32 v9, 16, v56
+; CHECK-NEXT: v_or_b32_e32 v4, v7, v4
+; CHECK-NEXT: v_add_co_u32 v7, s5, s46, v11
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s47, 0, s5
+; CHECK-NEXT: v_or3_b32 v10, v8, v10, v62
+; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, v0
+; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v11, vcc_lo
+; CHECK-NEXT: v_lshrrev_b64 v[5:6], 16, v[5:6]
+; CHECK-NEXT: v_or_b32_e32 v2, v9, v2
+; CHECK-NEXT: global_store_dword v[7:8], v10, off offset:4
+; CHECK-NEXT: global_store_dwordx4 v[7:8], v[1:4], off offset:8
+; CHECK-NEXT: global_store_dwordx2 v[7:8], v[5:6], off offset:24
+; CHECK-NEXT: ; implicit-def: $vgpr42
+; CHECK-NEXT: ; implicit-def: $vgpr43
+; CHECK-NEXT: ; implicit-def: $vgpr44
+; CHECK-NEXT: .LBB0_31: ; %Flow
+; CHECK-NEXT: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT: s_andn2_saveexec_b32 s4, s4
+; CHECK-NEXT: s_cbranch_execz .LBB0_27
+; CHECK-NEXT: ; %bb.32: ; in Loop: Header=BB0_28 Depth=1
+; CHECK-NEXT: v_mov_b32_e32 v31, v41
+; CHECK-NEXT: v_mov_b32_e32 v0, v42
+; CHECK-NEXT: v_mov_b32_e32 v1, v43
+; CHECK-NEXT: v_mov_b32_e32 v2, v44
+; CHECK-NEXT: s_add_u32 s8, s34, 40
+; CHECK-NEXT: s_addc_u32 s9, s35, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
+; CHECK-NEXT: s_mov_b32 s12, s41
+; CHECK-NEXT: s_mov_b32 s13, s40
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[44:45]
+; CHECK-NEXT: s_branch .LBB0_27
+; CHECK-NEXT: .LBB0_33:
+; CHECK-NEXT: s_endpgm
+ %6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4
+ %7 = trunc i64 %6 to i32
+ %8 = tail call i64 @_Z12get_local_idj(i32 noundef 0) #4
+ %9 = trunc i64 %8 to i32
+ %10 = mul i32 %9, 14
+ %11 = getelementptr inbounds i8, ptr addrspace(3) @kernel_round1.first_words_data, i32 %10
+ store i32 0, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
+ tail call void @_Z7barrierj(i32 noundef 1) #5
+ %12 = lshr i64 %6, 3
+ %13 = shl i32 %7, 2
+ %14 = and i32 %13, 28
+ %15 = and i64 %12, 536870911
+ %16 = getelementptr inbounds i32, ptr addrspace(1) %2, i64 %15
+ %17 = load i32, ptr addrspace(1) %16, align 4, !tbaa !11
+ %18 = lshr i32 %17, %14
+ %19 = and i32 %18, 15
+ %20 = tail call i32 @_Z3minjj(i32 noundef %19, i32 noundef 12) #4
+ %21 = icmp eq i32 %20, 0
+ br i1 %21, label %119, label %27
+
+22: ; preds = %27
+ %23 = add i32 %20, -1
+ %24 = icmp eq i32 %23, 0
+ br i1 %24, label %119, label %25
+
+25: ; preds = %22
+ %26 = shl i32 %7, 10
+ br label %37
+
+27: ; preds = %5, %27
+ %28 = phi i32 [ %30, %27 ], [ 0, %5 ]
+ %29 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %28
+ store i8 0, ptr addrspace(3) %29, align 1, !tbaa !15
+ %30 = add nuw i32 %28, 1
+ %31 = icmp eq i32 %30, %20
+ br i1 %31, label %22, label %27
+
+32: ; preds = %114, %48
+ %33 = phi i32 [ %50, %48 ], [ %115, %114 ]
+ %34 = icmp ult i32 %44, %23
+ %35 = icmp ult i32 %33, 60
+ %36 = select i1 %34, i1 %35, i1 false
+ br i1 %36, label %37, label %119
+
+37: ; preds = %32, %25
+ %38 = phi i32 [ 0, %25 ], [ %44, %32 ]
+ %39 = phi i32 [ 0, %25 ], [ %33, %32 ]
+ %40 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %38
+ %41 = load i8, ptr addrspace(3) %40, align 1, !tbaa !15
+ %42 = shl i32 %38, 5
+ %43 = or i32 %42, %26
+ %44 = add nuw i32 %38, 1
+ %45 = or i32 %43, %44
+ %46 = add i32 %38, 5
+ %47 = icmp ult i32 %46, %20
+ br i1 %47, label %53, label %48
+
+48: ; preds = %98, %37
+ %49 = phi i32 [ %45, %37 ], [ %100, %98 ]
+ %50 = phi i32 [ %39, %37 ], [ %99, %98 ]
+ %51 = phi i32 [ %44, %37 ], [ %54, %98 ]
+ %52 = icmp ult i32 %51, %20
+ br i1 %52, label %103, label %32
+
+53: ; preds = %37, %98
+ %54 = phi i32 [ %101, %98 ], [ %46, %37 ]
+ %55 = phi i32 [ %54, %98 ], [ %44, %37 ]
+ %56 = phi i32 [ %99, %98 ], [ %39, %37 ]
+ %57 = phi i32 [ %100, %98 ], [ %45, %37 ]
+ %58 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %55
+ %59 = load i8, ptr addrspace(3) %58, align 1, !tbaa !15
+ %60 = icmp eq i8 %41, %59
+ br i1 %60, label %61, label %65
+
+61: ; preds = %53
+ %62 = add i32 %56, 1
+ %63 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+ %64 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %63
+ store i32 %57, ptr addrspace(3) %64, align 4, !tbaa !11
+ br label %65
+
+65: ; preds = %61, %53
+ %66 = phi i32 [ %62, %61 ], [ %56, %53 ]
+ %67 = add i32 %55, 1
+ %68 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %67
+ %69 = load i8, ptr addrspace(3) %68, align 1, !tbaa !15
+ %70 = icmp eq i8 %41, %69
+ br i1 %70, label %71, label %76
+
+71: ; preds = %65
+ %72 = add i32 %57, 1
+ %73 = add i32 %66, 1
+ %74 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+ %75 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %74
+ store i32 %72, ptr addrspace(3) %75, align 4, !tbaa !11
+ br label %76
+
+76: ; preds = %71, %65
+ %77 = phi i32 [ %73, %71 ], [ %66, %65 ]
+ %78 = add i32 %55, 2
+ %79 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %78
+ %80 = load i8, ptr addrspace(3) %79, align 1, !tbaa !15
+ %81 = icmp eq i8 %41, %80
+ br i1 %81, label %82, label %87
+
+82: ; preds = %76
+ %83 = add i32 %57, 2
+ %84 = add i32 %77, 1
+ %85 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+ %86 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %85
+ store i32 %83, ptr addrspace(3) %86, align 4, !tbaa !11
+ br label %87
+
+87: ; preds = %82, %76
+ %88 = phi i32 [ %84, %82 ], [ %77, %76 ]
+ %89 = add i32 %55, 3
+ %90 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %89
+ %91 = load i8, ptr addrspace(3) %90, align 1, !tbaa !15
+ %92 = icmp eq i8 %41, %91
+ br i1 %92, label %93, label %98
+
+93: ; preds = %87
+ %94 = add i32 %57, 3
+ %95 = add i32 %88, 1
+ %96 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+ %97 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %96
+ store i32 %94, ptr addrspace(3) %97, align 4, !tbaa !11
+ br label %98
+
+98: ; preds = %93, %87
+ %99 = phi i32 [ %95, %93 ], [ %88, %87 ]
+ %100 = add i32 %57, 4
+ %101 = add i32 %54, 4
+ %102 = icmp ult i32 %101, %20
+ br i1 %102, label %53, label %48
+
+103: ; preds = %48, %114
+ %104 = phi i32 [ %117, %114 ], [ %51, %48 ]
+ %105 = phi i32 [ %115, %114 ], [ %50, %48 ]
+ %106 = phi i32 [ %116, %114 ], [ %49, %48 ]
+ %107 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %104
+ %108 = load i8, ptr addrspace(3) %107, align 1, !tbaa !15
+ %109 = icmp eq i8 %41, %108
+ br i1 %109, label %110, label %114
+
+110: ; preds = %103
+ %111 = add i32 %105, 1
+ %112 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+ %113 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %112
+ store i32 %106, ptr addrspace(3) %113, align 4, !tbaa !11
+ br label %114
+
+114: ; preds = %110, %103
+ %115 = phi i32 [ %111, %110 ], [ %105, %103 ]
+ %116 = add i32 %106, 1
+ %117 = add nuw i32 %104, 1
+ %118 = icmp ult i32 %117, %20
+ br i1 %118, label %103, label %32
+
+119: ; preds = %32, %22, %5
+ tail call void @_Z7barrierj(i32 noundef 1) #5
+ %120 = load i32, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
+ %121 = icmp ugt i32 %120, %9
+ br i1 %121, label %122, label %206
+
+122: ; preds = %119
+ %123 = getelementptr inbounds i8, ptr addrspace(1) %0, i64 8
+ br label %124
+
+124: ; preds = %201, %122
+ %125 = phi i32 [ %9, %122 ], [ %204, %201 ]
+ %126 = phi i64 [ %8, %122 ], [ %203, %201 ]
+ %127 = and i64 %126, 4294967295
+ %128 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %125
+ %129 = load i32, ptr addrspace(3) %128, align 4, !tbaa !11
+ %130 = lshr i32 %129, 10
+ %131 = lshr i32 %129, 5
+ %132 = and i32 %131, 31
+ %133 = and i32 %129, 31
+ %134 = mul nuw nsw i32 %130, 384
+ %135 = zext i32 %134 to i64
+ %136 = getelementptr inbounds i8, ptr addrspace(1) %123, i64 %135
+ %137 = shl nuw nsw i32 %132, 5
+ %138 = zext i32 %137 to i64
+ %139 = getelementptr inbounds i8, ptr addrspace(1) %136, i64 %138
+ %140 = shl nuw nsw i32 %133, 5
+ %141 = zext i32 %140 to i64
+ %142 = getelementptr inbounds i8, ptr addrspace(1) %136, i64 %141
+ %143 = getelementptr inbounds i64, ptr addrspace(1) %139, i64 1
+ %144 = load i64, ptr addrspace(1) %139, align 8, !tbaa !16
+ %145 = getelementptr inbounds i64, ptr addrspace(1) %142, i64 1
+ %146 = load i64, ptr addrspace(1) %142, align 8, !tbaa !16
+ %147 = xor i64 %146, %144
+ %148 = load i64, ptr addrspace(1) %143, align 8, !tbaa !16
+ %149 = load i64, ptr addrspace(1) %145, align 8, !tbaa !16
+ %150 = xor i64 %149, %148
+ %151 = icmp ne i64 %147, 0
+ %152 = icmp ne i64 %150, 0
+ %153 = select i1 %151, i1 true, i1 %152
+ br i1 %153, label %154, label %201
+
+154: ; preds = %124
+ %155 = getelementptr inbounds i64, ptr addrspace(1) %142, i64 2
+ %156 = load i64, ptr addrspace(1) %155, align 8, !tbaa !16
+ %157 = getelementptr inbounds i64, ptr addrspace(1) %139, i64 2
+ %158 = load i64, ptr addrspace(1) %157, align 8, !tbaa !16
+ %159 = and i64 %147, 983040
+ %160 = shl i64 %147, 4
+ %161 = and i64 %160, 61440
+ %162 = or i64 %161, %159
+ %163 = lshr i64 %147, 12
+ %164 = and i64 %163, 3840
+ %165 = or i64 %162, %164
+ %166 = and i64 %160, 240
+ %167 = or i64 %165, %166
+ %168 = and i64 %163, 15
+ %169 = or i64 %167, %168
+ %170 = trunc i64 %169 to i32
+ %171 = lshr i64 %169, 3
+ %172 = shl nuw nsw i32 %170, 2
+ %173 = and i32 %172, 28
+ %174 = getelementptr inbounds i32, ptr addrspace(1) %3, i64 %171
+ %175 = shl nuw nsw i32 1, %173
+ %176 = tail call i32 @_Z10atomic_addPU3AS1Vjj(ptr addrspace(1) noundef %174, i32 noundef %175) #5
+ %177 = lshr i32 %176, %173
+ %178 = and i32 %177, 15
+ %179 = icmp ugt i32 %178, 11
+ br i1 %179, label %180, label %182
+
+180: ; preds = %154
+ %181 = tail call i32 @_Z10atomic_subPU3AS1Vjj(ptr addrspace(1) noundef %174, i32 noundef %175) #5
+ br label %201
+
+182: ; preds = %154
+ %183 = xor i64 %158, %156
+ %184 = lshr i64 %183, 16
+ %185 = tail call i64 @llvm.fshl.i64(i64 %183, i64 %150, i64 48)
+ %186 = tail call i64 @llvm.fshl.i64(i64 %150, i64 %147, i64 48)
+ %187 = shl nuw nsw i32 %133, 6
+ %188 = shl i32 %130, 12
+ %189 = or i32 %187, %188
+ %190 = or i32 %189, %132
+ %191 = mul nuw nsw i64 %169, 384
+ %192 = and i64 %191, 4294967168
+ %193 = getelementptr inbounds i8, ptr addrspace(1) %1, i64 %192
+ %194 = shl nuw nsw i32 %178, 5
+ %195 = or i32 %194, 8
+ %196 = zext i32 %195 to i64
+ %197 = getelementptr inbounds i8, ptr addrspace(1) %193, i64 %196
+ %198 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 -4
+ store i32 %190, ptr addrspace(1) %198, align 4, !tbaa !11
+ store i64 %186, ptr addrspace(1) %197, align 8, !tbaa !16
+ %199 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 8
+ store i64 %185, ptr addrspace(1) %199, align 8, !tbaa !16
+ %200 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 16
+ store i64 %184, ptr addrspace(1) %200, align 8, !tbaa !16
+ br label %201
+
+201: ; preds = %182, %180, %124
+ %202 = tail call i64 @_Z14get_local_sizej(i32 noundef 0) #4
+ %203 = add i64 %202, %127
+ %204 = trunc i64 %203 to i32
+ %205 = icmp ugt i32 %120, %204
+ br i1 %205, label %124, label %206
+
+206: ; preds = %201, %119
+ ret void
+}
+
+; Removed most of the if-else blocks
+
+define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapture noundef readonly align 1 %.0, ptr addrspace(1) nocapture noundef writeonly align 1 %.1, ptr addrspace(1) nocapture noundef readonly align 4 %.2, ptr addrspace(1) noundef align 4 %.3, ptr addrspace(1) nocapture noundef readnone align 4 %.4) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 !reqd_work_group_size !10 {
+; CHECK-LABEL: kernel_round1_short:
+; CHECK: ; %bb.0: ; %.5
+; CHECK-NEXT: s_add_u32 s10, s10, s15
+; CHECK-NEXT: s_mov_b32 s32, 0
+; CHECK-NEXT: s_addc_u32 s11, s11, 0
+; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT: s_load_dwordx2 s[46:47], s[6:7], 0x10
+; CHECK-NEXT: s_add_u32 s0, s0, s15
+; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7]
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: v_mov_b32_e32 v40, v0
+; CHECK-NEXT: s_add_u32 s42, s36, 40
+; CHECK-NEXT: v_mov_b32_e32 v31, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
+; CHECK-NEXT: s_addc_u32 s43, s37, 0
+; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT: s_mov_b32 s33, s14
+; CHECK-NEXT: s_mov_b32 s40, s13
+; CHECK-NEXT: s_mov_b32 s41, s12
+; CHECK-NEXT: s_mov_b64 s[38:39], s[4:5]
+; CHECK-NEXT: s_getpc_b64 s[6:7]
+; CHECK-NEXT: s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+12
+; CHECK-NEXT: v_mov_b32_e32 v43, 0
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT: v_mov_b32_e32 v42, v0
+; CHECK-NEXT: v_mov_b32_e32 v31, v40
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b32 s12, s41
+; CHECK-NEXT: s_mov_b32 s13, s40
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: s_getpc_b64 s[6:7]
+; CHECK-NEXT: s_add_u32 s6, s6, _Z12get_local_idj@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s7, s7, _Z12get_local_idj@rel32@hi+12
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT: v_mul_lo_u32 v46, v0, 14
+; CHECK-NEXT: v_mov_b32_e32 v31, v40
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
+; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b32 s12, s41
+; CHECK-NEXT: s_mov_b32 s13, s40
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: ds_write_b32 v43, v43 offset:15360
+; CHECK-NEXT: s_getpc_b64 s[44:45]
+; CHECK-NEXT: s_add_u32 s44, s44, _Z7barrierj@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s45, s45, _Z7barrierj@rel32@hi+12
+; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v46
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[44:45]
+; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v42
+; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v42
+; CHECK-NEXT: v_mov_b32_e32 v31, v40
+; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0
+; CHECK-NEXT: v_and_b32_e32 v1, 28, v1
+; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b32 s12, s41
+; CHECK-NEXT: s_mov_b32 s13, s40
+; CHECK-NEXT: global_load_dword v0, v0, s[46:47]
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: s_getpc_b64 s[6:7]
+; CHECK-NEXT: s_add_u32 s6, s6, _Z3minjj@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s7, s7, _Z3minjj@rel32@hi+12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4
+; CHECK-NEXT: v_mov_b32_e32 v1, 12
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; CHECK-NEXT: v_mov_b32_e32 v41, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v42, 10, v42
+; CHECK-NEXT: s_getpc_b64 s[42:43]
+; CHECK-NEXT: s_add_u32 s42, s42, _Z10atomic_incPU3AS3Vj@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s43, s43, _Z10atomic_incPU3AS3Vj@rel32@hi+12
+; CHECK-NEXT: s_mov_b32 s46, 0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v41
+; CHECK-NEXT: ds_write_b8 v46, v43 offset:15364
+; CHECK-NEXT: .LBB1_1: ; %.37
+; CHECK-NEXT: ; =>This Loop Header: Depth=1
+; CHECK-NEXT: ; Child Loop BB1_3 Depth 2
+; CHECK-NEXT: ; Child Loop BB1_8 Depth 2
+; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44
+; CHECK-NEXT: s_lshl_b32 s5, s4, 5
+; CHECK-NEXT: s_add_i32 s47, s4, 1
+; CHECK-NEXT: s_add_i32 s6, s4, 5
+; CHECK-NEXT: v_or3_b32 v47, s5, v42, s47
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v46, v0
+; CHECK-NEXT: v_mov_b32_e32 v56, s47
+; CHECK-NEXT: s_mov_b32 s5, exec_lo
+; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41
+; CHECK-NEXT: s_cbranch_execz .LBB1_5
+; CHECK-NEXT: ; %bb.2: ; %.53.preheader
+; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: s_mov_b32 s6, 0
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: .LBB1_3: ; %.53
+; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
+; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
+; CHECK-NEXT: s_add_i32 s7, s7, 4
+; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
+; CHECK-NEXT: s_add_i32 s8, s4, s7
+; CHECK-NEXT: s_add_i32 s9, s8, 5
+; CHECK-NEXT: s_add_i32 s8, s8, 1
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41
+; CHECK-NEXT: v_mov_b32_e32 v56, s8
+; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execnz .LBB1_3
+; CHECK-NEXT: ; %bb.4: ; %Flow3
+; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: v_add_nc_u32_e32 v47, s7, v47
+; CHECK-NEXT: .LBB1_5: ; %Flow4
+; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_mov_b32 s48, exec_lo
+; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41
+; CHECK-NEXT: s_cbranch_execz .LBB1_11
+; CHECK-NEXT: ; %bb.6: ; %.103.preheader
+; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: s_mov_b32 s49, 0
+; CHECK-NEXT: s_inst_prefetch 0x1
+; CHECK-NEXT: s_branch .LBB1_8
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB1_7: ; %.114
+; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s50
+; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56
+; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41
+; CHECK-NEXT: s_or_b32 s49, vcc_lo, s49
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s49
+; CHECK-NEXT: s_cbranch_execz .LBB1_10
+; CHECK-NEXT: .LBB1_8: ; %.103
+; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
+; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56
+; CHECK-NEXT: ds_read_u8 v0, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NEXT: s_and_saveexec_b32 s50, s4
+; CHECK-NEXT: s_cbranch_execz .LBB1_7
+; CHECK-NEXT: ; %bb.9: ; %.110
+; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
+; CHECK-NEXT: v_mov_b32_e32 v31, v40
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
+; CHECK-NEXT: s_add_u32 s8, s36, 40
+; CHECK-NEXT: s_addc_u32 s9, s37, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b32 s12, s41
+; CHECK-NEXT: s_mov_b32 s13, s40
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[42:43]
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: ds_write_b32 v0, v47
+; CHECK-NEXT: s_branch .LBB1_7
+; CHECK-NEXT: .LBB1_10: ; %Flow
+; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: s_inst_prefetch 0x2
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s49
+; CHECK-NEXT: .LBB1_11: ; %Flow2
+; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48
+; CHECK-NEXT: ; %bb.12: ; %.32
+; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s47, v45
+; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43
+; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
+; CHECK-NEXT: s_and_b32 s4, exec_lo, s4
+; CHECK-NEXT: s_or_b32 s46, s4, s46
+; CHECK-NEXT: s_mov_b32 s4, s47
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s46
+; CHECK-NEXT: s_cbranch_execnz .LBB1_1
+; CHECK-NEXT: ; %bb.13: ; %.119
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s46
+; CHECK-NEXT: v_mov_b32_e32 v31, v40
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
+; CHECK-NEXT: s_add_u32 s8, s36, 40
+; CHECK-NEXT: s_addc_u32 s9, s37, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b32 s12, s41
+; CHECK-NEXT: s_mov_b32 s13, s40
+; CHECK-NEXT: s_mov_b32 s14, s33
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[44:45]
+; CHECK-NEXT: s_endpgm
+.5:
+ %.6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4
+ %.7 = trunc i64 %.6 to i32
+ %.8 = tail call i64 @_Z12get_local_idj(i32 noundef 0) #4
+ %.9 = trunc i64 %.8 to i32
+ %.10 = mul i32 %.9, 14
+ %.11 = getelementptr inbounds i8, ptr addrspace(3) @kernel_round1.first_words_data, i32 %.10
+ store i32 0, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
+ tail call void @_Z7barrierj(i32 noundef 1) #5
+ %.12 = lshr i64 %.6, 3
+ %.13 = shl i32 %.7, 2
+ %.14 = and i32 %.13, 28
+ %.15 = and i64 %.12, 536870911
+ %.16 = getelementptr inbounds i32, ptr addrspace(1) %.2, i64 %.15
+ %.17 = load i32, ptr addrspace(1) %.16, align 4, !tbaa !11
+ %.18 = lshr i32 %.17, %.14
+ %.19 = and i32 %.18, 15
+ %.20 = tail call i32 @_Z3minjj(i32 noundef %.19, i32 noundef 12) #4
+ %.21 = icmp eq i32 %.20, 0
+ %.23 = add i32 %.20, -1
+ %.24 = icmp eq i32 %.23, 0
+ store i8 0, ptr addrspace(3) %.11, align 1, !tbaa !15
+ br label %.37
+
+.32: ; preds = %.114, %.48
+ %.33 = phi i32 [ %.50, %.48 ], [ %.115, %.114 ]
+ %.34 = icmp ult i32 %.44, %.23
+ %.35 = icmp ult i32 %.33, 60
+ %.36 = select i1 %.34, i1 %.35, i1 false
+ br i1 %.36, label %.37, label %.119
+
+.37: ; preds = %.32, %.25
+ %.38 = phi i32 [ 0, %.5 ], [ %.44, %.32 ]
+ %.39 = phi i32 [ 0, %.5 ], [ %.33, %.32 ]
+ %.26 = shl i32 %.7, 10
+ %.40 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.38
+ %.41 = load i8, ptr addrspace(3) %.40, align 1, !tbaa !15
+ %.42 = shl i32 %.38, 5
+ %.43 = or i32 %.42, %.26
+ %.44 = add nuw i32 %.38, 1
+ %.45 = or i32 %.43, %.44
+ %.46 = add i32 %.38, 5
+ %.47 = icmp ult i32 %.46, %.20
+ br i1 %.47, label %.53, label %.48
+
+.48: ; preds = %.98, %.37
+ %.49 = phi i32 [ %.45, %.37 ], [ %.100, %.98 ]
+ %.50 = phi i32 [ %.39, %.37 ], [ %.99, %.98 ]
+ %.51 = phi i32 [ %.44, %.37 ], [ %.54, %.98 ]
+ %.52 = icmp ult i32 %.51, %.20
+ br i1 %.52, label %.103, label %.32
+
+.53: ; preds = %.37, %.98
+ %.54 = phi i32 [ %.101, %.98 ], [ %.46, %.37 ]
+ %.55 = phi i32 [ %.54, %.98 ], [ %.44, %.37 ]
+ %.56 = phi i32 [ %.99, %.98 ], [ %.39, %.37 ]
+ %.57 = phi i32 [ %.100, %.98 ], [ %.45, %.37 ]
+ %.58 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.55
+ %.59 = load i8, ptr addrspace(3) %.58, align 1, !tbaa !15
+ %.60 = icmp eq i8 %.41, %.59
+ br label %.98
+
+.98: ; preds = %.53
+ %.99 = add i32 %.56, 1
+ %.100 = add i32 %.57, 4
+ %.101 = add i32 %.54, 4
+ %.102 = icmp ult i32 %.101, %.20
+ br i1 %.102, label %.53, label %.48
+
+.103: ; preds = %.48, %.114
+ %.104 = phi i32 [ %.117, %.114 ], [ %.51, %.48 ]
+ %.105 = phi i32 [ %.115, %.114 ], [ %.50, %.48 ]
+ %.106 = phi i32 [ %.116, %.114 ], [ %.49, %.48 ]
+ %.107 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.104
+ %.108 = load i8, ptr addrspace(3) %.107, align 1, !tbaa !15
+ %.109 = icmp eq i8 %.41, %.108
+ br i1 %.109, label %.110, label %.114
+
+.110: ; preds = %.103
+ %.111 = add i32 %.105, 1
+ %.112 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
+ %.113 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %.112
+ store i32 %.106, ptr addrspace(3) %.113, align 4, !tbaa !11
+ br label %.114
+
+.114: ; preds = %.110, %.103
+ %.115 = phi i32 [ %.111, %.110 ], [ %.105, %.103 ]
+ %.116 = add i32 %.106, 1
+ %.117 = add nuw i32 %.104, 1
+ %.118 = icmp ult i32 %.117, %.20
+ br i1 %.118, label %.103, label %.32
+
+.119: ; preds = %.32
+ tail call void @_Z7barrierj(i32 noundef 1) #5
+ %.120 = load i32, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
+ %.121 = icmp ugt i32 %.120, %.9
+ br label %.206
+
+.206: ; preds = %.119
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i64 @llvm.fshl.i64(i64, i64, i64) #3
+
+attributes #0 = { convergent mustprogress nofree nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
+attributes #1 = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
+attributes #2 = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="64,64" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" "uniform-work-group-size"="true" }
+attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #4 = { convergent nounwind willreturn memory(none) }
+attributes #5 = { convergent nounwind }
+
+!llvm.module.flags = !{!0, !1, !2}
+!opencl.ocl.version = !{!3}
+!llvm.ident = !{!4}
+
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 8, !"PIC Level", i32 2}
+!3 = !{i32 1, i32 2}
+!4 = !{!"clang version 17.0.0 (ssh://chfang@git.amd.com:29418/lightning/ec/llvm-project 06ead8cf696777b9f17876b60707ba9de4d0606f)"}
+!5 = !{i32 1, i32 1, i32 1, i32 1, i32 1}
+!6 = !{!"none", !"none", !"none", !"none", !"none"}
+!7 = !{!"char*", !"char*", !"uint*", !"uint*", !"uint*"}
+!8 = !{!"", !"", !"", !"", !""}
+!9 = !{!"ht_src", !"ht_dst", !"rowCountersSrc", !"rowCountersDst", !"debug"}
+!10 = !{i32 64, i32 1, i32 1}
+!11 = !{!12, !12, i64 0}
+!12 = !{!"int", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !{!13, !13, i64 0}
+!16 = !{!17, !17, i64 0}
+!17 = !{!"long", !13, i64 0}
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir
new file mode 100644
index 000000000000000..191b400011b6b2b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir
@@ -0,0 +1,1319 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=machine-sink -o - %s | FileCheck %s
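+#
+# Reduced from SWDEV-407790. In kernel_round1_short, the checked output has
+# the V_ADD_U32 from the divergent inner loop (bb.9) sunk into bb.5 below
+# that loop's SI_END_CF, i.e. recomputed with the rejoined exec mask rather
+# than carried out of the loop as a temporally divergent VGPR value.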
+
+--- |
+ %llvm.amdgcn.kernel.kernel_round1.lds.t = type { [3840 x i32], i32, [896 x i8] }
+ @llvm.amdgcn.kernel.kernel_round1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kernel_round1.lds.t poison, align 4
+ declare hidden i64 @_Z13get_global_idj(i32 noundef)
+ declare hidden i64 @_Z12get_local_idj(i32 noundef)
+ declare hidden void @_Z7barrierj(i32 noundef)
+ declare hidden i32 @_Z3minjj(i32 noundef, i32 noundef)
+ declare hidden i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef)
+
+  define protected amdgpu_kernel void @kernel_round1_short() {
+ ret void
+ }
+
+  define protected amdgpu_kernel void @sink_salu() {
+ ret void
+ }
+...
+
+---
+name: kernel_round1_short
+alignment: 1
+tracksRegLiveness: true
+machineFunctionInfo:
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ stackPtrOffsetReg: '$sgpr32'
+
+body: |
+ ; CHECK-LABEL: name: kernel_round1_short
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr12, $sgpr13, $sgpr14
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY4]](p4), 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 40
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
+ ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY7]], [[COPY9]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY8]], [[COPY10]], implicit-def $scc, implicit $scc
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z13get_global_idj + 4, target-flags(amdgpu-rel32-hi) @_Z13get_global_idj + 12, implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY5]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]]
+ ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[REG_SEQUENCE]]
+ ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY3]]
+ ; CHECK-NEXT: $sgpr12 = COPY [[COPY2]]
+ ; CHECK-NEXT: $sgpr13 = COPY [[COPY1]]
+ ; CHECK-NEXT: $sgpr14 = COPY [[COPY]]
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]]
+ ; CHECK-NEXT: $vgpr31 = COPY [[COPY6]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY11]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]]
+ ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET]], @_Z13get_global_idj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0, implicit-def $vgpr1
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
+ ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET1:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z12get_local_idj + 4, target-flags(amdgpu-rel32-hi) @_Z12get_local_idj + 12, implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY5]]
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF2]]
+ ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[REG_SEQUENCE]]
+ ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY3]]
+ ; CHECK-NEXT: $sgpr12 = COPY [[COPY2]]
+ ; CHECK-NEXT: $sgpr13 = COPY [[COPY1]]
+ ; CHECK-NEXT: $sgpr14 = COPY [[COPY]]
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr15 = COPY [[DEF3]]
+ ; CHECK-NEXT: $vgpr31 = COPY [[COPY6]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY15]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]]
+ ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET1]], @_Z12get_local_idj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0, implicit-def $vgpr1
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY16]], %subreg.sub0, [[COPY17]], %subreg.sub1
+ ; CHECK-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[COPY18]], killed [[S_MOV_B32_]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 15364
+ ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_]], killed [[S_MOV_B32_1]], 0, implicit $exec
+ ; CHECK-NEXT: DS_WRITE_B32_gfx9 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], 15360, 0, implicit $exec :: (store (s32), align 1024, addrspace 3)
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET2:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z7barrierj + 4, target-flags(amdgpu-rel32-hi) @_Z7barrierj + 12, implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY19:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY5]]
+ ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF4]]
+ ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[REG_SEQUENCE]]
+ ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY3]]
+ ; CHECK-NEXT: $sgpr12 = COPY [[COPY2]]
+ ; CHECK-NEXT: $sgpr13 = COPY [[COPY1]]
+ ; CHECK-NEXT: $sgpr14 = COPY [[COPY]]
+ ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr15 = COPY [[DEF5]]
+ ; CHECK-NEXT: $vgpr31 = COPY [[COPY6]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_1]]
+ ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[SI_PC_ADD_REL_OFFSET2]], @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+ ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_2]], [[COPY14]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 28
+ ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_LSHLREV_B32_e64_]], killed [[S_MOV_B32_3]], implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY14]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483644
+ ; CHECK-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_LSHRREV_B32_e64_]], killed [[S_MOV_B32_4]], implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_AND_B32_e64_1]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; CHECK-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 killed [[GLOBAL_LOAD_DWORD_SADDR]], killed [[V_AND_B32_e64_]], 4, implicit $exec
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET3:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z3minjj + 4, target-flags(amdgpu-rel32-hi) @_Z3minjj + 12, implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY20:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12, implicit $exec
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY5]]
+ ; CHECK-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF6]]
+ ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[REG_SEQUENCE]]
+ ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY3]]
+ ; CHECK-NEXT: $sgpr12 = COPY [[COPY2]]
+ ; CHECK-NEXT: $sgpr13 = COPY [[COPY1]]
+ ; CHECK-NEXT: $sgpr14 = COPY [[COPY]]
+ ; CHECK-NEXT: [[DEF7:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr15 = COPY [[DEF7]]
+ ; CHECK-NEXT: $vgpr31 = COPY [[COPY6]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_BFE_U32_e64_]]
+ ; CHECK-NEXT: $vgpr1 = COPY [[V_MOV_B32_e32_2]]
+ ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET3]], @_Z3minjj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit-def $vgpr0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY21]], killed [[S_MOV_B32_5]], 0, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]]
+ ; CHECK-NEXT: DS_WRITE_B8_gfx9 [[V_MUL_LO_U32_e64_]], killed [[COPY22]], 15364, 0, implicit $exec :: (store (s8), addrspace 3)
+ ; CHECK-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 10
+ ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_7]], [[COPY14]], implicit $exec
+ ; CHECK-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 5
+ ; CHECK-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; CHECK-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 4
+ ; CHECK-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET4:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z10atomic_incPU3AS3Vj + 4, target-flags(amdgpu-rel32-hi) @_Z10atomic_incPU3AS3Vj + 12, implicit-def dead $scc
+ ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 15360, implicit $exec
+ ; CHECK-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF9:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 59
+ ; CHECK-NEXT: S_BRANCH %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.10(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %74, %bb.13
+ ; CHECK-NEXT: SI_END_CF %75, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.14(0x04000000), %bb.3(0x7c000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_CMP_GE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 %77, [[V_ADD_U32_e64_1]], implicit $exec
+ ; CHECK-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_U32_e64 %79, [[S_MOV_B32_12]], implicit $exec
+ ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[V_CMP_GE_U32_e64_]], killed [[V_CMP_GT_U32_e64_]], implicit-def dead $scc
+ ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[S_OR_B32_]], %82, implicit-def dead $scc
+ ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.14
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.8(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_6]], %bb.0, [[SI_IF_BREAK]], %bb.2
+ ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_6]], %bb.0, %77, %bb.2
+ ; CHECK-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY23]], %bb.0, %79, %bb.2
+ ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[PHI2]], 0, implicit $exec
+ ; CHECK-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 killed [[V_ADD_U32_e64_2]], 0, 0, implicit $exec :: (load (s8), addrspace 3)
+ ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[PHI2]], [[S_MOV_B32_8]], implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = nuw S_ADD_I32 [[PHI2]], [[S_MOV_B32_9]], implicit-def dead $scc
+ ; CHECK-NEXT: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 killed [[S_LSHL_B32_]], [[V_LSHLREV_B32_e64_1]], [[S_ADD_I32_]], implicit $exec
+ ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], [[S_MOV_B32_8]], implicit-def dead $scc
+ ; CHECK-NEXT: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_U32_e64 killed [[S_ADD_I32_1]], [[COPY21]], implicit $exec
+ ; CHECK-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]], implicit $exec
+ ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_U32_e64_]], %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4:
+ ; CHECK-NEXT: successors: %bb.9(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: S_BRANCH %bb.9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.5:
+ ; CHECK-NEXT: successors: %bb.8(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI %95, %bb.9
+ ; CHECK-NEXT: SI_END_CF %96, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_OR3_B32_e64_]], %108, 0, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.6:
+ ; CHECK-NEXT: successors: %bb.7(0x40000000), %bb.10(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_CMP_LT_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_LT_U32_e64 %98, [[COPY21]], implicit $exec
+ ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_U32_e64_1]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.7:
+ ; CHECK-NEXT: successors: %bb.11(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[DS_READ_U8_gfx9_]], [[S_MOV_B32_11]], implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.8:
+ ; CHECK-NEXT: successors: %bb.6(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[COPY24]], %bb.3, [[PHI4]], %bb.5
+ ; CHECK-NEXT: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.3, %103, %bb.5
+ ; CHECK-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI [[V_OR3_B32_e64_]], %bb.3, [[V_ADD_U32_e64_3]], %bb.5
+ ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.6
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.9:
+ ; CHECK-NEXT: successors: %bb.5(0x04000000), %bb.9(0x7c000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI8:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_13]], %bb.4, %96, %bb.9
+ ; CHECK-NEXT: [[PHI9:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_13]], %bb.4, %108, %bb.9
+ ; CHECK-NEXT: [[PHI10:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.4, %103, %bb.9
+ ; CHECK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI10]], [[S_MOV_B32_9]], 0, implicit $exec
+ ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI9]], [[S_MOV_B32_10]], implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], [[S_ADD_I32_2]], implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_3]], [[S_MOV_B32_8]], implicit-def dead $scc
+ ; CHECK-NEXT: [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 killed [[S_ADD_I32_4]], [[COPY21]], implicit $exec
+ ; CHECK-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_3]], [[S_MOV_B32_9]], implicit-def dead $scc
+ ; CHECK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[V_CMP_GE_U32_e64_1]], [[PHI8]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_5]], implicit $exec
+ ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.10:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI11:%[0-9]+]]:vgpr_32 = PHI [[PHI6]], %bb.6, [[PHI]], %bb.1
+ ; CHECK-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.11:
+ ; CHECK-NEXT: successors: %bb.12(0x40000000), %bb.13(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI12:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_14]], %bb.7, %75, %bb.13
+ ; CHECK-NEXT: [[PHI13:%[0-9]+]]:vgpr_32 = PHI [[PHI5]], %bb.7, %116, %bb.13
+ ; CHECK-NEXT: [[PHI14:%[0-9]+]]:vgpr_32 = PHI [[PHI6]], %bb.7, %74, %bb.13
+ ; CHECK-NEXT: [[PHI15:%[0-9]+]]:vgpr_32 = PHI [[PHI7]], %bb.7, %119, %bb.13
+ ; CHECK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[PHI13]], 0, implicit $exec
+ ; CHECK-NEXT: [[DS_READ_U8_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 killed [[V_ADD_U32_e64_5]], 0, 0, implicit $exec :: (load (s8), addrspace 3)
+ ; CHECK-NEXT: [[V_CMP_EQ_U16_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U16_e64 [[V_AND_B32_e64_2]], killed [[DS_READ_U8_gfx9_1]], implicit $exec
+ ; CHECK-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U16_e64_]], %bb.13, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.12
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.12:
+ ; CHECK-NEXT: successors: %bb.13(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI14]], [[S_MOV_B32_9]], 0, implicit $exec
+ ; CHECK-NEXT: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
+ ; CHECK-NEXT: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
+ ; CHECK-NEXT: [[COPY28:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
+ ; CHECK-NEXT: [[COPY29:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
+ ; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY26]], [[COPY28]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADDC_U32_1:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY27]], [[COPY29]], implicit-def $scc, implicit $scc
+ ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_1]], %subreg.sub0, [[S_ADDC_U32_1]], %subreg.sub1
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[COPY30:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY5]]
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF8]]
+ ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[REG_SEQUENCE3]]
+ ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY3]]
+ ; CHECK-NEXT: $sgpr12 = COPY [[COPY2]]
+ ; CHECK-NEXT: $sgpr13 = COPY [[COPY1]]
+ ; CHECK-NEXT: $sgpr14 = COPY [[COPY]]
+ ; CHECK-NEXT: $sgpr15 = COPY [[DEF9]]
+ ; CHECK-NEXT: $vgpr31 = COPY [[COPY6]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY30]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_3]]
+ ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[SI_PC_ADD_REL_OFFSET4]], @_Z10atomic_incPU3AS3Vj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_2]], [[COPY31]], implicit $exec
+ ; CHECK-NEXT: DS_WRITE_B32_gfx9 killed [[V_LSHLREV_B32_e64_2]], [[PHI15]], 0, 0, implicit $exec :: (store (s32), addrspace 3)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.13:
+ ; CHECK-NEXT: successors: %bb.1(0x04000000), %bb.11(0x7c000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI16:%[0-9]+]]:vgpr_32 = PHI [[PHI14]], %bb.11, [[V_ADD_U32_e64_6]], %bb.12
+ ; CHECK-NEXT: SI_END_CF [[SI_IF2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI15]], [[S_MOV_B32_9]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI13]], [[S_MOV_B32_9]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_CMP_GE_U32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[V_ADD_U32_e64_8]], [[COPY21]], implicit $exec
+ ; CHECK-NEXT: [[SI_IF_BREAK2:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[V_CMP_GE_U32_e64_2]], [[PHI12]], implicit-def dead $scc
+ ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK2]], %bb.11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.14:
+ ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 40
+ ; CHECK-NEXT: [[COPY32:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
+ ; CHECK-NEXT: [[COPY33:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
+ ; CHECK-NEXT: [[COPY34:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_1]].sub0
+ ; CHECK-NEXT: [[COPY35:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_1]].sub1
+ ; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY32]], [[COPY34]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADDC_U32_2:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY33]], [[COPY35]], implicit-def $scc, implicit $scc
+ ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_2]], %subreg.sub0, [[S_ADDC_U32_2]], %subreg.sub1
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[COPY36:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY5]]
+ ; CHECK-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF10]]
+ ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[REG_SEQUENCE4]]
+ ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY3]]
+ ; CHECK-NEXT: $sgpr12 = COPY [[COPY2]]
+ ; CHECK-NEXT: $sgpr13 = COPY [[COPY1]]
+ ; CHECK-NEXT: $sgpr14 = COPY [[COPY]]
+ ; CHECK-NEXT: [[DEF11:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr15 = COPY [[DEF11]]
+ ; CHECK-NEXT: $vgpr31 = COPY [[COPY6]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY36]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_4]]
+ ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[SI_PC_ADD_REL_OFFSET2]], @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.3(0x80000000)
+ liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr12, $sgpr13, $sgpr14
+
+ %51:sgpr_32 = COPY $sgpr14
+ %50:sgpr_32 = COPY $sgpr13
+ %49:sgpr_32 = COPY $sgpr12
+ %47:sgpr_64 = COPY $sgpr8_sgpr9
+ %46:sgpr_64(p4) = COPY $sgpr6_sgpr7
+ %45:sgpr_64 = COPY $sgpr4_sgpr5
+ %43:vgpr_32(s32) = COPY $vgpr0
+ %54:sreg_64_xexec = S_LOAD_DWORDX2_IMM %46(p4), 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+ %55:sreg_64 = S_MOV_B64 40
+ %158:sreg_32 = COPY %46.sub0(p4)
+ %159:sreg_32 = COPY %46.sub1(p4)
+ %160:sreg_32 = COPY %55.sub0
+ %161:sreg_32 = COPY %55.sub1
+ %156:sreg_32 = S_ADD_U32 %158, %160, implicit-def $scc
+ %157:sreg_32 = S_ADDC_U32 %159, %161, implicit-def $scc, implicit $scc
+ %56:sreg_64 = REG_SEQUENCE %156, %subreg.sub0, %157, %subreg.sub1
+ ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %57:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z13get_global_idj + 4, target-flags(amdgpu-rel32-hi) @_Z13get_global_idj + 12, implicit-def dead $scc
+ %58:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %59:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ $sgpr4_sgpr5 = COPY %45
+ %60:sreg_64 = IMPLICIT_DEF
+ $sgpr6_sgpr7 = COPY %60
+ $sgpr8_sgpr9 = COPY %56
+ $sgpr10_sgpr11 = COPY %47
+ $sgpr12 = COPY %49
+ $sgpr13 = COPY %50
+ $sgpr14 = COPY %51
+ %61:sreg_32 = IMPLICIT_DEF
+ $sgpr15 = COPY %61
+ $vgpr31 = COPY %43(s32)
+ $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %58
+ $vgpr0 = COPY %59
+ $sgpr30_sgpr31 = SI_CALL killed %57, @_Z13get_global_idj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0, implicit-def $vgpr1
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %62:vgpr_32 = COPY $vgpr0
+ %63:vgpr_32 = COPY $vgpr1
+ %150:vreg_64 = REG_SEQUENCE %62, %subreg.sub0, %63, %subreg.sub1
+ %152:vgpr_32 = COPY %150.sub0
+ ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %66:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z12get_local_idj + 4, target-flags(amdgpu-rel32-hi) @_Z12get_local_idj + 12, implicit-def dead $scc
+ %67:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ $sgpr4_sgpr5 = COPY %45
+ %68:sreg_64 = IMPLICIT_DEF
+ $sgpr6_sgpr7 = COPY %68
+ $sgpr8_sgpr9 = COPY %56
+ $sgpr10_sgpr11 = COPY %47
+ $sgpr12 = COPY %49
+ $sgpr13 = COPY %50
+ $sgpr14 = COPY %51
+ %69:sreg_32 = IMPLICIT_DEF
+ $sgpr15 = COPY %69
+ $vgpr31 = COPY %43(s32)
+ $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %67
+ $vgpr0 = COPY %59
+ $sgpr30_sgpr31 = SI_CALL killed %66, @_Z12get_local_idj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0, implicit-def $vgpr1
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %70:vgpr_32 = COPY $vgpr0
+ %71:vgpr_32 = COPY $vgpr1
+ %149:vreg_64 = REG_SEQUENCE %70, %subreg.sub0, %71, %subreg.sub1
+ %151:vgpr_32 = COPY %149.sub0
+ %74:sreg_32 = S_MOV_B32 14
+ %75:vgpr_32 = V_MUL_LO_U32_e64 killed %151, killed %74, implicit $exec
+ %76:sreg_32 = S_MOV_B32 15364
+ %4:vgpr_32 = V_ADD_U32_e64 %75, killed %76, 0, implicit $exec
+ DS_WRITE_B32_gfx9 %59, %59, 15360, 0, implicit $exec :: (store (s32), align 1024, addrspace 3)
+ ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %77:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z7barrierj + 4, target-flags(amdgpu-rel32-hi) @_Z7barrierj + 12, implicit-def dead $scc
+ %78:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %79:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ $sgpr4_sgpr5 = COPY %45
+ %80:sreg_64 = IMPLICIT_DEF
+ $sgpr6_sgpr7 = COPY %80
+ $sgpr8_sgpr9 = COPY %56
+ $sgpr10_sgpr11 = COPY %47
+ $sgpr12 = COPY %49
+ $sgpr13 = COPY %50
+ $sgpr14 = COPY %51
+ %81:sreg_32 = IMPLICIT_DEF
+ $sgpr15 = COPY %81
+ $vgpr31 = COPY %43(s32)
+ $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %78
+ $vgpr0 = COPY %79
+ $sgpr30_sgpr31 = SI_CALL %77, @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %82:sreg_32 = S_MOV_B32 2
+ %83:vgpr_32 = V_LSHLREV_B32_e64 %82, %152, implicit $exec
+ %84:sreg_32 = S_MOV_B32 28
+ %85:vgpr_32 = V_AND_B32_e64 killed %83, killed %84, implicit $exec
+ %86:vgpr_32 = V_LSHRREV_B32_e64 %79, %152, implicit $exec
+ %87:sreg_32 = S_MOV_B32 2147483644
+ %88:vgpr_32 = V_AND_B32_e64 killed %86, killed %87, implicit $exec
+ %89:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed %54, killed %88, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ %90:vgpr_32 = V_BFE_U32_e64 killed %89, killed %85, 4, implicit $exec
+ ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %91:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z3minjj + 4, target-flags(amdgpu-rel32-hi) @_Z3minjj + 12, implicit-def dead $scc
+ %92:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %93:vgpr_32 = V_MOV_B32_e32 12, implicit $exec
+ $sgpr4_sgpr5 = COPY %45
+ %94:sreg_64 = IMPLICIT_DEF
+ $sgpr6_sgpr7 = COPY %94
+ $sgpr8_sgpr9 = COPY %56
+ $sgpr10_sgpr11 = COPY %47
+ $sgpr12 = COPY %49
+ $sgpr13 = COPY %50
+ $sgpr14 = COPY %51
+ %95:sreg_32 = IMPLICIT_DEF
+ $sgpr15 = COPY %95
+ $vgpr31 = COPY %43(s32)
+ $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %92
+ $vgpr0 = COPY %90
+ $vgpr1 = COPY %93
+ $sgpr30_sgpr31 = SI_CALL killed %91, @_Z3minjj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit-def $vgpr0
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %96:vgpr_32 = COPY $vgpr0
+ %97:sreg_32 = S_MOV_B32 -1
+ %2:vgpr_32 = V_ADD_U32_e64 %96, killed %97, 0, implicit $exec
+ %98:sreg_32 = S_MOV_B32 0
+ %99:vgpr_32 = COPY %98
+ DS_WRITE_B8_gfx9 %75, killed %99, 15364, 0, implicit $exec :: (store (s8), addrspace 3)
+ %100:sreg_32 = S_MOV_B32 10
+ %3:vgpr_32 = V_LSHLREV_B32_e64 killed %100, %152, implicit $exec
+ %153:vgpr_32 = COPY %98, implicit $exec
+ %102:sreg_32 = S_MOV_B32 5
+ %104:sreg_32 = S_MOV_B32 1
+ %109:sreg_32 = S_MOV_B32 4
+ %118:sreg_32 = S_MOV_B32 255
+ %124:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z10atomic_incPU3AS3Vj + 4, target-flags(amdgpu-rel32-hi) @_Z10atomic_incPU3AS3Vj + 12, implicit-def dead $scc
+ %126:vgpr_32 = V_MOV_B32_e32 15360, implicit $exec
+ %127:sreg_64 = IMPLICIT_DEF
+ %128:sreg_32 = IMPLICIT_DEF
+ %135:sreg_32 = S_MOV_B32 59
+ S_BRANCH %bb.3
+
+ bb.1:
+ successors: %bb.10(0x80000000)
+
+ %5:vgpr_32 = PHI %38, %bb.13
+ SI_END_CF %41, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.10
+
+ bb.2:
+ successors: %bb.14(0x04000000), %bb.3(0x7c000000)
+
+ %134:sreg_32 = V_CMP_GE_U32_e64 %12, %2, implicit $exec
+ %136:sreg_32 = V_CMP_GT_U32_e64 %31, %135, implicit $exec
+ %137:sreg_32 = S_OR_B32 killed %134, killed %136, implicit-def dead $scc
+ %7:sreg_32 = SI_IF_BREAK killed %137, %8, implicit-def dead $scc
+ SI_LOOP %7, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.14
+
+ bb.3:
+ successors: %bb.4(0x40000000), %bb.8(0x40000000)
+
+ %8:sreg_32 = PHI %98, %bb.0, %7, %bb.2
+ %9:sreg_32 = PHI %98, %bb.0, %12, %bb.2
+ %10:vgpr_32 = PHI %153, %bb.0, %31, %bb.2
+ %101:vgpr_32 = V_ADD_U32_e64 %4, %9, 0, implicit $exec
+ %11:vgpr_32 = DS_READ_U8_gfx9 killed %101, 0, 0, implicit $exec :: (load (s8), addrspace 3)
+ %103:sreg_32 = S_LSHL_B32 %9, %102, implicit-def dead $scc
+ %12:sreg_32 = nuw S_ADD_I32 %9, %104, implicit-def dead $scc
+ %13:vgpr_32 = V_OR3_B32_e64 killed %103, %3, %12, implicit $exec
+ %105:sreg_32 = S_ADD_I32 %9, %102, implicit-def dead $scc
+ %106:sreg_32 = V_CMP_LT_U32_e64 killed %105, %96, implicit $exec
+ %155:vgpr_32 = COPY %12, implicit $exec
+ %14:sreg_32 = SI_IF killed %106, %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.4:
+ successors: %bb.9(0x80000000)
+
+ %107:sreg_32 = S_MOV_B32 0
+ S_BRANCH %bb.9
+
+ bb.5:
+ successors: %bb.8(0x80000000)
+
+ %17:vgpr_32 = PHI %154, %bb.9
+ SI_END_CF %30, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.8
+
+ bb.6:
+ successors: %bb.7(0x40000000), %bb.10(0x40000000)
+
+ %114:sreg_32 = V_CMP_LT_U32_e64 %20, %96, implicit $exec
+ %19:sreg_32 = SI_IF killed %114, %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.7
+
+ bb.7:
+ successors: %bb.11(0x80000000)
+
+ %115:sreg_32 = S_MOV_B32 0
+ %119:vgpr_32 = V_AND_B32_e64 %11, %118, implicit $exec
+ S_BRANCH %bb.11
+
+ bb.8:
+ successors: %bb.6(0x80000000)
+
+ %20:vgpr_32 = PHI %155, %bb.3, %17, %bb.5
+ %21:vgpr_32 = PHI %10, %bb.3, %26, %bb.5
+ %22:vgpr_32 = PHI %13, %bb.3, %28, %bb.5
+ SI_END_CF %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.6
+
+ bb.9:
+ successors: %bb.5(0x04000000), %bb.9(0x7c000000)
+
+ %23:sreg_32 = PHI %107, %bb.4, %30, %bb.9
+ %24:sreg_32 = PHI %107, %bb.4, %27, %bb.9
+ %25:vgpr_32 = PHI %10, %bb.4, %26, %bb.9
+ %26:vgpr_32 = V_ADD_U32_e64 %25, %104, 0, implicit $exec
+ %27:sreg_32 = S_ADD_I32 %24, %109, implicit-def dead $scc
+ %110:sreg_32 = S_ADD_I32 %9, %27, implicit-def dead $scc
+ %112:sreg_32 = S_ADD_I32 %110, %102, implicit-def dead $scc
+ %113:sreg_32 = V_CMP_GE_U32_e64 killed %112, %96, implicit $exec
+ %28:vgpr_32 = V_ADD_U32_e64 %13, %27, 0, implicit $exec
+ %29:sreg_32 = S_ADD_I32 %110, %104, implicit-def dead $scc
+ %30:sreg_32 = SI_IF_BREAK killed %113, %23, implicit-def dead $scc
+ %154:vgpr_32 = COPY %29, implicit $exec
+ SI_LOOP %30, %bb.9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.5
+
+ bb.10:
+ successors: %bb.2(0x80000000)
+
+ %31:vgpr_32 = PHI %21, %bb.6, %5, %bb.1
+ SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.11:
+ successors: %bb.12(0x40000000), %bb.13(0x40000000)
+
+ %32:sreg_32 = PHI %115, %bb.7, %41, %bb.13
+ %33:vgpr_32 = PHI %20, %bb.7, %40, %bb.13
+ %34:vgpr_32 = PHI %21, %bb.7, %38, %bb.13
+ %35:vgpr_32 = PHI %22, %bb.7, %39, %bb.13
+ %116:vgpr_32 = V_ADD_U32_e64 %4, %33, 0, implicit $exec
+ %117:vgpr_32 = DS_READ_U8_gfx9 killed %116, 0, 0, implicit $exec :: (load (s8), addrspace 3)
+ %120:sreg_32 = V_CMP_EQ_U16_e64 %119, killed %117, implicit $exec
+ %36:sreg_32 = SI_IF killed %120, %bb.13, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.12
+
+ bb.12:
+ successors: %bb.13(0x80000000)
+
+ %37:vgpr_32 = V_ADD_U32_e64 %34, %104, 0, implicit $exec
+ %164:sreg_32 = COPY %46.sub0(p4)
+ %165:sreg_32 = COPY %46.sub1(p4)
+ %166:sreg_32 = COPY %55.sub0
+ %167:sreg_32 = COPY %55.sub1
+ %162:sreg_32 = S_ADD_U32 %164, %166, implicit-def $scc
+ %163:sreg_32 = S_ADDC_U32 %165, %167, implicit-def $scc, implicit $scc
+ %123:sreg_64 = REG_SEQUENCE %162, %subreg.sub0, %163, %subreg.sub1
+ ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %125:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ $sgpr4_sgpr5 = COPY %45
+ $sgpr6_sgpr7 = COPY %127
+ $sgpr8_sgpr9 = COPY %123
+ $sgpr10_sgpr11 = COPY %47
+ $sgpr12 = COPY %49
+ $sgpr13 = COPY %50
+ $sgpr14 = COPY %51
+ $sgpr15 = COPY %128
+ $vgpr31 = COPY %43(s32)
+ $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %125
+ $vgpr0 = COPY %126
+ $sgpr30_sgpr31 = SI_CALL %124, @_Z10atomic_incPU3AS3Vj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %129:vgpr_32 = COPY $vgpr0
+ %131:vgpr_32 = V_LSHLREV_B32_e64 %82, %129, implicit $exec
+ DS_WRITE_B32_gfx9 killed %131, %35, 0, 0, implicit $exec :: (store (s32), addrspace 3)
+
+ bb.13:
+ successors: %bb.1(0x04000000), %bb.11(0x7c000000)
+
+ %38:vgpr_32 = PHI %34, %bb.11, %37, %bb.12
+ SI_END_CF %36, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ %39:vgpr_32 = V_ADD_U32_e64 %35, %104, 0, implicit $exec
+ %40:vgpr_32 = V_ADD_U32_e64 %33, %104, 0, implicit $exec
+ %133:sreg_32 = V_CMP_GE_U32_e64 %40, %96, implicit $exec
+ %41:sreg_32 = SI_IF_BREAK killed %133, %32, implicit-def dead $scc
+ SI_LOOP %41, %bb.11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.14:
+ SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ %138:sreg_64 = S_MOV_B64 40
+ %170:sreg_32 = COPY %46.sub0(p4)
+ %171:sreg_32 = COPY %46.sub1(p4)
+ %172:sreg_32 = COPY %138.sub0
+ %173:sreg_32 = COPY %138.sub1
+ %168:sreg_32 = S_ADD_U32 %170, %172, implicit-def $scc
+ %169:sreg_32 = S_ADDC_U32 %171, %173, implicit-def $scc, implicit $scc
+ %139:sreg_64 = REG_SEQUENCE %168, %subreg.sub0, %169, %subreg.sub1
+ ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %141:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %142:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ $sgpr4_sgpr5 = COPY %45
+ %143:sreg_64 = IMPLICIT_DEF
+ $sgpr6_sgpr7 = COPY %143
+ $sgpr8_sgpr9 = COPY %139
+ $sgpr10_sgpr11 = COPY %47
+ $sgpr12 = COPY %49
+ $sgpr13 = COPY %50
+ $sgpr14 = COPY %51
+ %144:sreg_32 = IMPLICIT_DEF
+ $sgpr15 = COPY %144
+ $vgpr31 = COPY %43(s32)
+ $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %141
+ $vgpr0 = COPY %142
+ $sgpr30_sgpr31 = SI_CALL %77, @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ S_ENDPGM 0
+
+...
+
+
+---
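+# sink_salu: same structure as kernel_round1_short, but the uniform
+# S_ADD_I32 chain feeding the V_ADD_U32 is expected to be sunk out of the
+# inner loop as well; the checks below look for both S_ADD_I32s under the
+# SI_END_CF in bb.5.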
+name: sink_salu
+alignment: 1
+tracksRegLiveness: true
+machineFunctionInfo:
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ stackPtrOffsetReg: '$sgpr32'
+
+body: |
+ ; CHECK-LABEL: name: sink_salu
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr12, $sgpr13, $sgpr14
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY4]](p4), 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 40
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
+ ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY7]], [[COPY9]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY8]], [[COPY10]], implicit-def $scc, implicit $scc
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z13get_global_idj + 4, target-flags(amdgpu-rel32-hi) @_Z13get_global_idj + 12, implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY5]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]]
+ ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[REG_SEQUENCE]]
+ ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY3]]
+ ; CHECK-NEXT: $sgpr12 = COPY [[COPY2]]
+ ; CHECK-NEXT: $sgpr13 = COPY [[COPY1]]
+ ; CHECK-NEXT: $sgpr14 = COPY [[COPY]]
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]]
+ ; CHECK-NEXT: $vgpr31 = COPY [[COPY6]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY11]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]]
+ ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET]], @_Z13get_global_idj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0, implicit-def $vgpr1
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
+ ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET1:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z12get_local_idj + 4, target-flags(amdgpu-rel32-hi) @_Z12get_local_idj + 12, implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY5]]
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF2]]
+ ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[REG_SEQUENCE]]
+ ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY3]]
+ ; CHECK-NEXT: $sgpr12 = COPY [[COPY2]]
+ ; CHECK-NEXT: $sgpr13 = COPY [[COPY1]]
+ ; CHECK-NEXT: $sgpr14 = COPY [[COPY]]
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr15 = COPY [[DEF3]]
+ ; CHECK-NEXT: $vgpr31 = COPY [[COPY6]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY15]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]]
+ ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET1]], @_Z12get_local_idj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0, implicit-def $vgpr1
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY16]], %subreg.sub0, [[COPY17]], %subreg.sub1
+ ; CHECK-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14
+ ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[COPY18]], killed [[S_MOV_B32_]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 15364
+ ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_]], killed [[S_MOV_B32_1]], 0, implicit $exec
+ ; CHECK-NEXT: DS_WRITE_B32_gfx9 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], 15360, 0, implicit $exec :: (store (s32), align 1024, addrspace 3)
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET2:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z7barrierj + 4, target-flags(amdgpu-rel32-hi) @_Z7barrierj + 12, implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY19:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY5]]
+ ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF4]]
+ ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[REG_SEQUENCE]]
+ ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY3]]
+ ; CHECK-NEXT: $sgpr12 = COPY [[COPY2]]
+ ; CHECK-NEXT: $sgpr13 = COPY [[COPY1]]
+ ; CHECK-NEXT: $sgpr14 = COPY [[COPY]]
+ ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr15 = COPY [[DEF5]]
+ ; CHECK-NEXT: $vgpr31 = COPY [[COPY6]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_1]]
+ ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[SI_PC_ADD_REL_OFFSET2]], @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+ ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_2]], [[COPY14]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 28
+ ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_LSHLREV_B32_e64_]], killed [[S_MOV_B32_3]], implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY14]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483644
+ ; CHECK-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_LSHRREV_B32_e64_]], killed [[S_MOV_B32_4]], implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_AND_B32_e64_1]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; CHECK-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 killed [[GLOBAL_LOAD_DWORD_SADDR]], killed [[V_AND_B32_e64_]], 4, implicit $exec
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET3:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z3minjj + 4, target-flags(amdgpu-rel32-hi) @_Z3minjj + 12, implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY20:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12, implicit $exec
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY5]]
+ ; CHECK-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF6]]
+ ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[REG_SEQUENCE]]
+ ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY3]]
+ ; CHECK-NEXT: $sgpr12 = COPY [[COPY2]]
+ ; CHECK-NEXT: $sgpr13 = COPY [[COPY1]]
+ ; CHECK-NEXT: $sgpr14 = COPY [[COPY]]
+ ; CHECK-NEXT: [[DEF7:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr15 = COPY [[DEF7]]
+ ; CHECK-NEXT: $vgpr31 = COPY [[COPY6]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_BFE_U32_e64_]]
+ ; CHECK-NEXT: $vgpr1 = COPY [[V_MOV_B32_e32_2]]
+ ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET3]], @_Z3minjj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit-def $vgpr0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY21]], killed [[S_MOV_B32_5]], 0, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]]
+ ; CHECK-NEXT: DS_WRITE_B8_gfx9 [[V_MUL_LO_U32_e64_]], killed [[COPY22]], 15364, 0, implicit $exec :: (store (s8), addrspace 3)
+ ; CHECK-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 10
+ ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_7]], [[COPY14]], implicit $exec
+ ; CHECK-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 5
+ ; CHECK-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; CHECK-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 4
+ ; CHECK-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET4:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z10atomic_incPU3AS3Vj + 4, target-flags(amdgpu-rel32-hi) @_Z10atomic_incPU3AS3Vj + 12, implicit-def dead $scc
+ ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 15360, implicit $exec
+ ; CHECK-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF9:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 59
+ ; CHECK-NEXT: S_BRANCH %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.10(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %74, %bb.13
+ ; CHECK-NEXT: SI_END_CF %75, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.14(0x04000000), %bb.3(0x7c000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_CMP_GE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 %77, [[V_ADD_U32_e64_1]], implicit $exec
+ ; CHECK-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_U32_e64 %79, [[S_MOV_B32_12]], implicit $exec
+ ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[V_CMP_GE_U32_e64_]], killed [[V_CMP_GT_U32_e64_]], implicit-def dead $scc
+ ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[S_OR_B32_]], %82, implicit-def dead $scc
+ ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.14
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.8(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_6]], %bb.0, [[SI_IF_BREAK]], %bb.2
+ ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_6]], %bb.0, %77, %bb.2
+ ; CHECK-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY23]], %bb.0, %79, %bb.2
+ ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[PHI2]], 0, implicit $exec
+ ; CHECK-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 killed [[V_ADD_U32_e64_2]], 0, 0, implicit $exec :: (load (s8), addrspace 3)
+ ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[PHI2]], [[S_MOV_B32_8]], implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = nuw S_ADD_I32 [[PHI2]], [[S_MOV_B32_9]], implicit-def dead $scc
+ ; CHECK-NEXT: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 killed [[S_LSHL_B32_]], [[V_LSHLREV_B32_e64_1]], [[S_ADD_I32_]], implicit $exec
+ ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], [[S_MOV_B32_8]], implicit-def dead $scc
+ ; CHECK-NEXT: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_U32_e64 killed [[S_ADD_I32_1]], [[COPY21]], implicit $exec
+ ; CHECK-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]], implicit $exec
+ ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_U32_e64_]], %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4:
+ ; CHECK-NEXT: successors: %bb.9(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: S_BRANCH %bb.9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.5:
+ ; CHECK-NEXT: successors: %bb.8(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI %95, %bb.9
+ ; CHECK-NEXT: SI_END_CF %96, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 %108, 1, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_2]], 2, implicit-def dead $scc
+ ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_OR3_B32_e64_]], [[S_ADD_I32_3]], 0, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.6:
+ ; CHECK-NEXT: successors: %bb.7(0x40000000), %bb.10(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_CMP_LT_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_LT_U32_e64 %98, [[COPY21]], implicit $exec
+ ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_U32_e64_1]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.7:
+ ; CHECK-NEXT: successors: %bb.11(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[DS_READ_U8_gfx9_]], [[S_MOV_B32_11]], implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.8:
+ ; CHECK-NEXT: successors: %bb.6(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[COPY24]], %bb.3, [[PHI4]], %bb.5
+ ; CHECK-NEXT: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.3, %103, %bb.5
+ ; CHECK-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI [[V_OR3_B32_e64_]], %bb.3, [[V_ADD_U32_e64_3]], %bb.5
+ ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.6
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.9:
+ ; CHECK-NEXT: successors: %bb.5(0x04000000), %bb.9(0x7c000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI8:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_13]], %bb.4, %96, %bb.9
+ ; CHECK-NEXT: [[PHI9:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_13]], %bb.4, %108, %bb.9
+ ; CHECK-NEXT: [[PHI10:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.4, %103, %bb.9
+ ; CHECK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI10]], [[S_MOV_B32_9]], 0, implicit $exec
+ ; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI9]], [[S_MOV_B32_10]], implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], [[S_ADD_I32_4]], implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_5]], [[S_MOV_B32_8]], implicit-def dead $scc
+ ; CHECK-NEXT: [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 killed [[S_ADD_I32_6]], [[COPY21]], implicit $exec
+ ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_5]], [[S_MOV_B32_9]], implicit-def dead $scc
+ ; CHECK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[V_CMP_GE_U32_e64_1]], [[PHI8]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_7]], implicit $exec
+ ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.10:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI11:%[0-9]+]]:vgpr_32 = PHI [[PHI6]], %bb.6, [[PHI]], %bb.1
+ ; CHECK-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.11:
+ ; CHECK-NEXT: successors: %bb.12(0x40000000), %bb.13(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI12:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_14]], %bb.7, %75, %bb.13
+ ; CHECK-NEXT: [[PHI13:%[0-9]+]]:vgpr_32 = PHI [[PHI5]], %bb.7, %118, %bb.13
+ ; CHECK-NEXT: [[PHI14:%[0-9]+]]:vgpr_32 = PHI [[PHI6]], %bb.7, %74, %bb.13
+ ; CHECK-NEXT: [[PHI15:%[0-9]+]]:vgpr_32 = PHI [[PHI7]], %bb.7, %121, %bb.13
+ ; CHECK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[PHI13]], 0, implicit $exec
+ ; CHECK-NEXT: [[DS_READ_U8_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 killed [[V_ADD_U32_e64_5]], 0, 0, implicit $exec :: (load (s8), addrspace 3)
+ ; CHECK-NEXT: [[V_CMP_EQ_U16_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U16_e64 [[V_AND_B32_e64_2]], killed [[DS_READ_U8_gfx9_1]], implicit $exec
+ ; CHECK-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U16_e64_]], %bb.13, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.12
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.12:
+ ; CHECK-NEXT: successors: %bb.13(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI14]], [[S_MOV_B32_9]], 0, implicit $exec
+ ; CHECK-NEXT: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
+ ; CHECK-NEXT: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
+ ; CHECK-NEXT: [[COPY28:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
+ ; CHECK-NEXT: [[COPY29:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
+ ; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY26]], [[COPY28]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADDC_U32_1:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY27]], [[COPY29]], implicit-def $scc, implicit $scc
+ ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_1]], %subreg.sub0, [[S_ADDC_U32_1]], %subreg.sub1
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[COPY30:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY5]]
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF8]]
+ ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[REG_SEQUENCE3]]
+ ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY3]]
+ ; CHECK-NEXT: $sgpr12 = COPY [[COPY2]]
+ ; CHECK-NEXT: $sgpr13 = COPY [[COPY1]]
+ ; CHECK-NEXT: $sgpr14 = COPY [[COPY]]
+ ; CHECK-NEXT: $sgpr15 = COPY [[DEF9]]
+ ; CHECK-NEXT: $vgpr31 = COPY [[COPY6]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY30]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_3]]
+ ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[SI_PC_ADD_REL_OFFSET4]], @_Z10atomic_incPU3AS3Vj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_2]], [[COPY31]], implicit $exec
+ ; CHECK-NEXT: DS_WRITE_B32_gfx9 killed [[V_LSHLREV_B32_e64_2]], [[PHI15]], 0, 0, implicit $exec :: (store (s32), addrspace 3)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.13:
+ ; CHECK-NEXT: successors: %bb.1(0x04000000), %bb.11(0x7c000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI16:%[0-9]+]]:vgpr_32 = PHI [[PHI14]], %bb.11, [[V_ADD_U32_e64_6]], %bb.12
+ ; CHECK-NEXT: SI_END_CF [[SI_IF2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI15]], [[S_MOV_B32_9]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI13]], [[S_MOV_B32_9]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_CMP_GE_U32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[V_ADD_U32_e64_8]], [[COPY21]], implicit $exec
+ ; CHECK-NEXT: [[SI_IF_BREAK2:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[V_CMP_GE_U32_e64_2]], [[PHI12]], implicit-def dead $scc
+ ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK2]], %bb.11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.14:
+ ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 40
+ ; CHECK-NEXT: [[COPY32:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0(p4)
+ ; CHECK-NEXT: [[COPY33:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1(p4)
+ ; CHECK-NEXT: [[COPY34:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_1]].sub0
+ ; CHECK-NEXT: [[COPY35:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_1]].sub1
+ ; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY32]], [[COPY34]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADDC_U32_2:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY33]], [[COPY35]], implicit-def $scc, implicit $scc
+ ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_2]], %subreg.sub0, [[S_ADDC_U32_2]], %subreg.sub1
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[COPY36:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY5]]
+ ; CHECK-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF10]]
+ ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[REG_SEQUENCE4]]
+ ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY3]]
+ ; CHECK-NEXT: $sgpr12 = COPY [[COPY2]]
+ ; CHECK-NEXT: $sgpr13 = COPY [[COPY1]]
+ ; CHECK-NEXT: $sgpr14 = COPY [[COPY]]
+ ; CHECK-NEXT: [[DEF11:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: $sgpr15 = COPY [[DEF11]]
+ ; CHECK-NEXT: $vgpr31 = COPY [[COPY6]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY36]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_4]]
+ ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[SI_PC_ADD_REL_OFFSET2]], @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.3(0x80000000)
+ liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr12, $sgpr13, $sgpr14
+
+ %51:sgpr_32 = COPY $sgpr14
+ %50:sgpr_32 = COPY $sgpr13
+ %49:sgpr_32 = COPY $sgpr12
+ %47:sgpr_64 = COPY $sgpr8_sgpr9
+ %46:sgpr_64(p4) = COPY $sgpr6_sgpr7
+ %45:sgpr_64 = COPY $sgpr4_sgpr5
+ %43:vgpr_32(s32) = COPY $vgpr0
+ %54:sreg_64_xexec = S_LOAD_DWORDX2_IMM %46(p4), 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+ %55:sreg_64 = S_MOV_B64 40
+ %158:sreg_32 = COPY %46.sub0(p4)
+ %159:sreg_32 = COPY %46.sub1(p4)
+ %160:sreg_32 = COPY %55.sub0
+ %161:sreg_32 = COPY %55.sub1
+ %156:sreg_32 = S_ADD_U32 %158, %160, implicit-def $scc
+ %157:sreg_32 = S_ADDC_U32 %159, %161, implicit-def $scc, implicit $scc
+ %56:sreg_64 = REG_SEQUENCE %156, %subreg.sub0, %157, %subreg.sub1
+ ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %57:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z13get_global_idj + 4, target-flags(amdgpu-rel32-hi) @_Z13get_global_idj + 12, implicit-def dead $scc
+ %58:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %59:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ $sgpr4_sgpr5 = COPY %45
+ %60:sreg_64 = IMPLICIT_DEF
+ $sgpr6_sgpr7 = COPY %60
+ $sgpr8_sgpr9 = COPY %56
+ $sgpr10_sgpr11 = COPY %47
+ $sgpr12 = COPY %49
+ $sgpr13 = COPY %50
+ $sgpr14 = COPY %51
+ %61:sreg_32 = IMPLICIT_DEF
+ $sgpr15 = COPY %61
+ $vgpr31 = COPY %43(s32)
+ $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %58
+ $vgpr0 = COPY %59
+ $sgpr30_sgpr31 = SI_CALL killed %57, @_Z13get_global_idj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0, implicit-def $vgpr1
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %62:vgpr_32 = COPY $vgpr0
+ %63:vgpr_32 = COPY $vgpr1
+ %150:vreg_64 = REG_SEQUENCE %62, %subreg.sub0, %63, %subreg.sub1
+ %152:vgpr_32 = COPY %150.sub0
+ ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %66:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z12get_local_idj + 4, target-flags(amdgpu-rel32-hi) @_Z12get_local_idj + 12, implicit-def dead $scc
+ %67:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ $sgpr4_sgpr5 = COPY %45
+ %68:sreg_64 = IMPLICIT_DEF
+ $sgpr6_sgpr7 = COPY %68
+ $sgpr8_sgpr9 = COPY %56
+ $sgpr10_sgpr11 = COPY %47
+ $sgpr12 = COPY %49
+ $sgpr13 = COPY %50
+ $sgpr14 = COPY %51
+ %69:sreg_32 = IMPLICIT_DEF
+ $sgpr15 = COPY %69
+ $vgpr31 = COPY %43(s32)
+ $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %67
+ $vgpr0 = COPY %59
+ $sgpr30_sgpr31 = SI_CALL killed %66, @_Z12get_local_idj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0, implicit-def $vgpr1
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %70:vgpr_32 = COPY $vgpr0
+ %71:vgpr_32 = COPY $vgpr1
+ %149:vreg_64 = REG_SEQUENCE %70, %subreg.sub0, %71, %subreg.sub1
+ %151:vgpr_32 = COPY %149.sub0
+ %74:sreg_32 = S_MOV_B32 14
+ %75:vgpr_32 = V_MUL_LO_U32_e64 killed %151, killed %74, implicit $exec
+ %76:sreg_32 = S_MOV_B32 15364
+ %4:vgpr_32 = V_ADD_U32_e64 %75, killed %76, 0, implicit $exec
+ DS_WRITE_B32_gfx9 %59, %59, 15360, 0, implicit $exec :: (store (s32), align 1024, addrspace 3)
+ ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %77:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z7barrierj + 4, target-flags(amdgpu-rel32-hi) @_Z7barrierj + 12, implicit-def dead $scc
+ %78:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %79:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ $sgpr4_sgpr5 = COPY %45
+ %80:sreg_64 = IMPLICIT_DEF
+ $sgpr6_sgpr7 = COPY %80
+ $sgpr8_sgpr9 = COPY %56
+ $sgpr10_sgpr11 = COPY %47
+ $sgpr12 = COPY %49
+ $sgpr13 = COPY %50
+ $sgpr14 = COPY %51
+ %81:sreg_32 = IMPLICIT_DEF
+ $sgpr15 = COPY %81
+ $vgpr31 = COPY %43(s32)
+ $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %78
+ $vgpr0 = COPY %79
+ $sgpr30_sgpr31 = SI_CALL %77, @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %82:sreg_32 = S_MOV_B32 2
+ %83:vgpr_32 = V_LSHLREV_B32_e64 %82, %152, implicit $exec
+ %84:sreg_32 = S_MOV_B32 28
+ %85:vgpr_32 = V_AND_B32_e64 killed %83, killed %84, implicit $exec
+ %86:vgpr_32 = V_LSHRREV_B32_e64 %79, %152, implicit $exec
+ %87:sreg_32 = S_MOV_B32 2147483644
+ %88:vgpr_32 = V_AND_B32_e64 killed %86, killed %87, implicit $exec
+ %89:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed %54, killed %88, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ %90:vgpr_32 = V_BFE_U32_e64 killed %89, killed %85, 4, implicit $exec
+ ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %91:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z3minjj + 4, target-flags(amdgpu-rel32-hi) @_Z3minjj + 12, implicit-def dead $scc
+ %92:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %93:vgpr_32 = V_MOV_B32_e32 12, implicit $exec
+ $sgpr4_sgpr5 = COPY %45
+ %94:sreg_64 = IMPLICIT_DEF
+ $sgpr6_sgpr7 = COPY %94
+ $sgpr8_sgpr9 = COPY %56
+ $sgpr10_sgpr11 = COPY %47
+ $sgpr12 = COPY %49
+ $sgpr13 = COPY %50
+ $sgpr14 = COPY %51
+ %95:sreg_32 = IMPLICIT_DEF
+ $sgpr15 = COPY %95
+ $vgpr31 = COPY %43(s32)
+ $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %92
+ $vgpr0 = COPY %90
+ $vgpr1 = COPY %93
+ $sgpr30_sgpr31 = SI_CALL killed %91, @_Z3minjj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit-def $vgpr0
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %96:vgpr_32 = COPY $vgpr0
+ %97:sreg_32 = S_MOV_B32 -1
+ %2:vgpr_32 = V_ADD_U32_e64 %96, killed %97, 0, implicit $exec
+ %98:sreg_32 = S_MOV_B32 0
+ %99:vgpr_32 = COPY %98
+ DS_WRITE_B8_gfx9 %75, killed %99, 15364, 0, implicit $exec :: (store (s8), addrspace 3)
+ %100:sreg_32 = S_MOV_B32 10
+ %3:vgpr_32 = V_LSHLREV_B32_e64 killed %100, %152, implicit $exec
+ %153:vgpr_32 = COPY %98, implicit $exec
+ %102:sreg_32 = S_MOV_B32 5
+ %104:sreg_32 = S_MOV_B32 1
+ %109:sreg_32 = S_MOV_B32 4
+ %118:sreg_32 = S_MOV_B32 255
+ %124:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z10atomic_incPU3AS3Vj + 4, target-flags(amdgpu-rel32-hi) @_Z10atomic_incPU3AS3Vj + 12, implicit-def dead $scc
+ %126:vgpr_32 = V_MOV_B32_e32 15360, implicit $exec
+ %127:sreg_64 = IMPLICIT_DEF
+ %128:sreg_32 = IMPLICIT_DEF
+ %135:sreg_32 = S_MOV_B32 59
+ S_BRANCH %bb.3
+
+ bb.1:
+ successors: %bb.10(0x80000000)
+
+ %5:vgpr_32 = PHI %38, %bb.13
+ SI_END_CF %41, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.10
+
+ bb.2:
+ successors: %bb.14(0x04000000), %bb.3(0x7c000000)
+
+ %134:sreg_32 = V_CMP_GE_U32_e64 %12, %2, implicit $exec
+ %136:sreg_32 = V_CMP_GT_U32_e64 %31, %135, implicit $exec
+ %137:sreg_32 = S_OR_B32 killed %134, killed %136, implicit-def dead $scc
+ %7:sreg_32 = SI_IF_BREAK killed %137, %8, implicit-def dead $scc
+ SI_LOOP %7, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.14
+
+ bb.3:
+ successors: %bb.4(0x40000000), %bb.8(0x40000000)
+
+ %8:sreg_32 = PHI %98, %bb.0, %7, %bb.2
+ %9:sreg_32 = PHI %98, %bb.0, %12, %bb.2
+ %10:vgpr_32 = PHI %153, %bb.0, %31, %bb.2
+ %101:vgpr_32 = V_ADD_U32_e64 %4, %9, 0, implicit $exec
+ %11:vgpr_32 = DS_READ_U8_gfx9 killed %101, 0, 0, implicit $exec :: (load (s8), addrspace 3)
+ %103:sreg_32 = S_LSHL_B32 %9, %102, implicit-def dead $scc
+ %12:sreg_32 = nuw S_ADD_I32 %9, %104, implicit-def dead $scc
+ %13:vgpr_32 = V_OR3_B32_e64 killed %103, %3, %12, implicit $exec
+ %105:sreg_32 = S_ADD_I32 %9, %102, implicit-def dead $scc
+ %106:sreg_32 = V_CMP_LT_U32_e64 killed %105, %96, implicit $exec
+ %155:vgpr_32 = COPY %12, implicit $exec
+ %14:sreg_32 = SI_IF killed %106, %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.4:
+ successors: %bb.9(0x80000000)
+
+ %107:sreg_32 = S_MOV_B32 0
+ S_BRANCH %bb.9
+
+ bb.5:
+ successors: %bb.8(0x80000000)
+
+ %17:vgpr_32 = PHI %154, %bb.9
+ SI_END_CF %30, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.8
+
+ bb.6:
+ successors: %bb.7(0x40000000), %bb.10(0x40000000)
+
+ %114:sreg_32 = V_CMP_LT_U32_e64 %20, %96, implicit $exec
+ %19:sreg_32 = SI_IF killed %114, %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.7
+
+ bb.7:
+ successors: %bb.11(0x80000000)
+
+ %115:sreg_32 = S_MOV_B32 0
+ %119:vgpr_32 = V_AND_B32_e64 %11, %118, implicit $exec
+ S_BRANCH %bb.11
+
+ bb.8:
+ successors: %bb.6(0x80000000)
+
+ %20:vgpr_32 = PHI %155, %bb.3, %17, %bb.5
+ %21:vgpr_32 = PHI %10, %bb.3, %26, %bb.5
+ %22:vgpr_32 = PHI %13, %bb.3, %28, %bb.5
+ SI_END_CF %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.6
+
+ bb.9:
+ successors: %bb.5(0x04000000), %bb.9(0x7c000000)
+
+ %23:sreg_32 = PHI %107, %bb.4, %30, %bb.9
+ %24:sreg_32 = PHI %107, %bb.4, %27, %bb.9
+ %25:vgpr_32 = PHI %10, %bb.4, %26, %bb.9
+ %26:vgpr_32 = V_ADD_U32_e64 %25, %104, 0, implicit $exec
+ %27:sreg_32 = S_ADD_I32 %24, %109, implicit-def dead $scc
+ %110:sreg_32 = S_ADD_I32 %9, %27, implicit-def dead $scc
+ %112:sreg_32 = S_ADD_I32 %110, %102, implicit-def dead $scc
+ %113:sreg_32 = V_CMP_GE_U32_e64 killed %112, %96, implicit $exec
+ %1000:sreg_32 = S_ADD_I32 %27, 1, implicit-def dead $scc
+ %1001:sreg_32 = S_ADD_I32 %1000, 2, implicit-def dead $scc
+ %28:vgpr_32 = V_ADD_U32_e64 %13, %1001, 0, implicit $exec
+ %29:sreg_32 = S_ADD_I32 %110, %104, implicit-def dead $scc
+ %30:sreg_32 = SI_IF_BREAK killed %113, %23, implicit-def dead $scc
+ %154:vgpr_32 = COPY %29, implicit $exec
+ SI_LOOP %30, %bb.9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.5
+
+ bb.10:
+ successors: %bb.2(0x80000000)
+
+ %31:vgpr_32 = PHI %21, %bb.6, %5, %bb.1
+ SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.11:
+ successors: %bb.12(0x40000000), %bb.13(0x40000000)
+
+ %32:sreg_32 = PHI %115, %bb.7, %41, %bb.13
+ %33:vgpr_32 = PHI %20, %bb.7, %40, %bb.13
+ %34:vgpr_32 = PHI %21, %bb.7, %38, %bb.13
+ %35:vgpr_32 = PHI %22, %bb.7, %39, %bb.13
+ %116:vgpr_32 = V_ADD_U32_e64 %4, %33, 0, implicit $exec
+ %117:vgpr_32 = DS_READ_U8_gfx9 killed %116, 0, 0, implicit $exec :: (load (s8), addrspace 3)
+ %120:sreg_32 = V_CMP_EQ_U16_e64 %119, killed %117, implicit $exec
+ %36:sreg_32 = SI_IF killed %120, %bb.13, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.12
+
+ bb.12:
+ successors: %bb.13(0x80000000)
+
+ %37:vgpr_32 = V_ADD_U32_e64 %34, %104, 0, implicit $exec
+ %164:sreg_32 = COPY %46.sub0(p4)
+ %165:sreg_32 = COPY %46.sub1(p4)
+ %166:sreg_32 = COPY %55.sub0
+ %167:sreg_32 = COPY %55.sub1
+ %162:sreg_32 = S_ADD_U32 %164, %166, implicit-def $scc
+ %163:sreg_32 = S_ADDC_U32 %165, %167, implicit-def $scc, implicit $scc
+ %123:sreg_64 = REG_SEQUENCE %162, %subreg.sub0, %163, %subreg.sub1
+ ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %125:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ $sgpr4_sgpr5 = COPY %45
+ $sgpr6_sgpr7 = COPY %127
+ $sgpr8_sgpr9 = COPY %123
+ $sgpr10_sgpr11 = COPY %47
+ $sgpr12 = COPY %49
+ $sgpr13 = COPY %50
+ $sgpr14 = COPY %51
+ $sgpr15 = COPY %128
+ $vgpr31 = COPY %43(s32)
+ $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %125
+ $vgpr0 = COPY %126
+ $sgpr30_sgpr31 = SI_CALL %124, @_Z10atomic_incPU3AS3Vj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit-def $vgpr0
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %129:vgpr_32 = COPY $vgpr0
+ %131:vgpr_32 = V_LSHLREV_B32_e64 %82, %129, implicit $exec
+ DS_WRITE_B32_gfx9 killed %131, %35, 0, 0, implicit $exec :: (store (s32), addrspace 3)
+
+ bb.13:
+ successors: %bb.1(0x04000000), %bb.11(0x7c000000)
+
+ %38:vgpr_32 = PHI %34, %bb.11, %37, %bb.12
+ SI_END_CF %36, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ %39:vgpr_32 = V_ADD_U32_e64 %35, %104, 0, implicit $exec
+ %40:vgpr_32 = V_ADD_U32_e64 %33, %104, 0, implicit $exec
+ %133:sreg_32 = V_CMP_GE_U32_e64 %40, %96, implicit $exec
+ %41:sreg_32 = SI_IF_BREAK killed %133, %32, implicit-def dead $scc
+ SI_LOOP %41, %bb.11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.14:
+ SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ %138:sreg_64 = S_MOV_B64 40
+ %170:sreg_32 = COPY %46.sub0(p4)
+ %171:sreg_32 = COPY %46.sub1(p4)
+ %172:sreg_32 = COPY %138.sub0
+ %173:sreg_32 = COPY %138.sub1
+ %168:sreg_32 = S_ADD_U32 %170, %172, implicit-def $scc
+ %169:sreg_32 = S_ADDC_U32 %171, %173, implicit-def $scc, implicit $scc
+ %139:sreg_64 = REG_SEQUENCE %168, %subreg.sub0, %169, %subreg.sub1
+ ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %141:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %142:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ $sgpr4_sgpr5 = COPY %45
+ %143:sreg_64 = IMPLICIT_DEF
+ $sgpr6_sgpr7 = COPY %143
+ $sgpr8_sgpr9 = COPY %139
+ $sgpr10_sgpr11 = COPY %47
+ $sgpr12 = COPY %49
+ $sgpr13 = COPY %50
+ $sgpr14 = COPY %51
+ %144:sreg_32 = IMPLICIT_DEF
+ $sgpr15 = COPY %144
+ $vgpr31 = COPY %43(s32)
+ $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %141
+ $vgpr0 = COPY %142
+ $sgpr30_sgpr31 = SI_CALL %77, @_Z7barrierj, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ S_ENDPGM 0
+
+...
>From 532dfb69dcc3f3ab363f41bd52daefd182981a36 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Tue, 3 Oct 2023 18:48:55 +0200
Subject: [PATCH 3/3] AMDGPU: Fix temporal divergence introduced by
machine-sink
Temporal divergence that was present in the input or introduced by IR
transforms, such as code sinking or LICM, is handled in SIFixSGPRCopies
by rewriting the SGPR-defining instruction into a VGPR instruction.
After 5b657f5, which moved LICM after AMDGPUCodeGenPrepare, machine
sinking can itself introduce temporal divergence by sinking instructions
out of the cycle.
Add an isSafeToSink callback to TargetInstrInfo so targets can reject
such sinks.
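
For readers unfamiliar with the term, a minimal source-level sketch of
temporal divergence (illustrative only; the kernel, names and constants
are hypothetical and not taken from the tests below):

  kernel void temporal(global uint *out) {
    uint tid = get_local_id(0);
    uint i = 0, v = 0;
    do {
      ++i;        // uniform step: becomes an SGPR add
      v = i * 4;  // VGPR use of the uniform value; must stay in the loop
    } while (i <= tid);  // divergent exit: each lane leaves on its own iteration
    out[tid] = v;        // v holds the value from the lane's last active iteration
  }

If machine sinking moves the VGPR multiply below the loop, it reads i
only once, after the last lane has exited, so every lane observes the
final value of i instead of the value from its own last iteration. The
new hook lets SIInstrInfo refuse exactly that kind of sink.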
---
llvm/include/llvm/CodeGen/MachineBasicBlock.h | 9 ++++
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 7 ++++
llvm/lib/CodeGen/MachineBasicBlock.cpp | 4 ++
llvm/lib/CodeGen/MachineSink.cpp | 3 ++
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 42 +++++++++++++++++++
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 3 ++
...ne-sink-temporal-divergence-swdev407790.ll | 6 ++-
...e-sink-temporal-divergence-swdev407790.mir | 26 ++++++------
8 files changed, 85 insertions(+), 15 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index ed9fc8f7ec3d75e..15c4fcd8399c181 100644
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -794,6 +794,15 @@ class MachineBasicBlock
static_cast<const MachineBasicBlock *>(this)->getSingleSuccessor());
}
+ /// Return the predecessor of this block if it has a single predecessor.
+ /// Otherwise return a null pointer.
+ ///
+ const MachineBasicBlock *getSinglePredecessor() const;
+ MachineBasicBlock *getSinglePredecessor() {
+ return const_cast<MachineBasicBlock *>(
+ static_cast<const MachineBasicBlock *>(this)->getSinglePredecessor());
+ }
+
/// Return the fallthrough block if the block can implicitly
/// transfer control to the block after it by falling off the end of
/// it. If an explicit branch to the fallthrough block is not allowed,
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 98679b4dcf3cbfb..9f8721b823bd434 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -19,6 +19,8 @@
#include "llvm/ADT/Uniformity.h"
#include "llvm/CodeGen/MIRFormatter.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -150,6 +152,11 @@ class TargetInstrInfo : public MCInstrInfo {
return false;
}
+ virtual bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo,
+ MachineCycleInfo *CI) const {
+ return true;
+ }
+
protected:
/// For instructions with opcodes for which the M_REMATERIALIZABLE flag is
/// set, this hook lets the target specify whether the instruction is actually
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index 280ced65db7d8c0..7d3d8b6fba1b7df 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -960,6 +960,10 @@ const MachineBasicBlock *MachineBasicBlock::getSingleSuccessor() const {
return Successors.size() == 1 ? Successors[0] : nullptr;
}
+const MachineBasicBlock *MachineBasicBlock::getSinglePredecessor() const {
+ return Predecessors.size() == 1 ? Predecessors[0] : nullptr;
+}
+
MachineBasicBlock *MachineBasicBlock::getFallThrough(bool JumpToFallThrough) {
MachineFunction::iterator Fallthrough = getIterator();
++Fallthrough;
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 02c7880f86f00a1..8f25b981f495fa2 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -1260,6 +1260,9 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
if (SuccToSinkTo && SuccToSinkTo->isInlineAsmBrIndirectTarget())
return nullptr;
+ if (!TII->isSafeToSink(MI, SuccToSinkTo, CI))
+ return nullptr;
+
return SuccToSinkTo;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2799a3e78b04d22..786b08f59c392ed 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -171,6 +171,48 @@ bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
}
+bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
+ MachineBasicBlock *SuccToSinkTo,
+ MachineCycleInfo *CI) const {
+ CI->clear();
+ CI->compute(*MI.getMF());
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+
+ // Check if sinking MI would create a temporal-divergent use.
+ for (auto Op : MI.uses()) {
+ if (Op.isReg() && Op.getReg().isVirtual() &&
+ RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
+ MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
+
+ // SgprDef defined inside cycle
+ MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
+ if (FromCycle == nullptr)
+ return true;
+
+ // After structurize-cfg, there should be exactly one cycle exit.
+ SmallVector<MachineBasicBlock *, 1> ExitBlocks;
+ FromCycle->getExitBlocks(ExitBlocks);
+ assert(ExitBlocks.size() == 1);
+ assert(ExitBlocks[0]->getSinglePredecessor());
+
+ // Cycle has divergent exit condition.
+ if (!hasDivergentBranch(ExitBlocks[0]->getSinglePredecessor()))
+ return true;
+
+ // SuccToSinkTo is not in the cycle.
+ if (FromCycle != CI->getCycle(SuccToSinkTo)) {
+
+ // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
+ if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
+ return true;
+
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
int64_t &Offset0,
int64_t &Offset1) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index a4f59fc3513d646..5ef17c44f7de389 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -222,6 +222,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
bool isIgnorableUse(const MachineOperand &MO) const override;
+ bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo,
+ MachineCycleInfo *CI) const override;
+
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0,
int64_t &Offset1) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index ca1cf526d949a14..e2683bba37f4bc9 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -167,6 +167,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s59
; CHECK-NEXT: s_add_i32 s58, s58, 4
; CHECK-NEXT: s_add_i32 s4, s55, s58
+; CHECK-NEXT: v_add_nc_u32_e32 v0, s58, v57
; CHECK-NEXT: s_add_i32 s5, s4, 5
; CHECK-NEXT: s_add_i32 s4, s4, 1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42
@@ -267,7 +268,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: .LBB0_16: ; %Flow43
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57
-; CHECK-NEXT: v_add_nc_u32_e32 v57, s58, v57
+; CHECK-NEXT: v_mov_b32_e32 v57, v0
; CHECK-NEXT: .LBB0_17: ; %Flow44
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s56
@@ -869,6 +870,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_add_i32 s7, s7, 4
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
; CHECK-NEXT: s_add_i32 s8, s4, s7
+; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v47
; CHECK-NEXT: s_add_i32 s9, s8, 5
; CHECK-NEXT: s_add_i32 s8, s8, 1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41
@@ -879,7 +881,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: ; %bb.4: ; %Flow3
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; CHECK-NEXT: v_add_nc_u32_e32 v47, s7, v47
+; CHECK-NEXT: v_mov_b32_e32 v47, v0
; CHECK-NEXT: .LBB1_5: ; %Flow4
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir
index 191b400011b6b2b..1ecbcb9fcb99273 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.mir
@@ -212,7 +212,6 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI %95, %bb.9
; CHECK-NEXT: SI_END_CF %96, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_OR3_B32_e64_]], %108, 0, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.8
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.6:
@@ -234,7 +233,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[COPY24]], %bb.3, [[PHI4]], %bb.5
; CHECK-NEXT: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.3, %103, %bb.5
- ; CHECK-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI [[V_OR3_B32_e64_]], %bb.3, [[V_ADD_U32_e64_3]], %bb.5
+ ; CHECK-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI [[V_OR3_B32_e64_]], %bb.3, %105, %bb.5
; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.6
; CHECK-NEXT: {{ $}}
@@ -244,11 +243,12 @@ body: |
; CHECK-NEXT: [[PHI8:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_13]], %bb.4, %96, %bb.9
; CHECK-NEXT: [[PHI9:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_13]], %bb.4, %108, %bb.9
; CHECK-NEXT: [[PHI10:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.4, %103, %bb.9
- ; CHECK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI10]], [[S_MOV_B32_9]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI10]], [[S_MOV_B32_9]], 0, implicit $exec
; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI9]], [[S_MOV_B32_10]], implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], [[S_ADD_I32_2]], implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_3]], [[S_MOV_B32_8]], implicit-def dead $scc
; CHECK-NEXT: [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 killed [[S_ADD_I32_4]], [[COPY21]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_OR3_B32_e64_]], [[S_ADD_I32_2]], 0, implicit $exec
; CHECK-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_3]], [[S_MOV_B32_9]], implicit-def dead $scc
; CHECK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[V_CMP_GE_U32_e64_1]], [[PHI8]], implicit-def dead $scc
; CHECK-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_5]], implicit $exec
@@ -860,9 +860,6 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI %95, %bb.9
; CHECK-NEXT: SI_END_CF %96, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 %108, 1, implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_2]], 2, implicit-def dead $scc
- ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_OR3_B32_e64_]], [[S_ADD_I32_3]], 0, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.8
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.6:
@@ -884,7 +881,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[COPY24]], %bb.3, [[PHI4]], %bb.5
; CHECK-NEXT: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.3, %103, %bb.5
- ; CHECK-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI [[V_OR3_B32_e64_]], %bb.3, [[V_ADD_U32_e64_3]], %bb.5
+ ; CHECK-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI [[V_OR3_B32_e64_]], %bb.3, %105, %bb.5
; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.6
; CHECK-NEXT: {{ $}}
@@ -894,12 +891,15 @@ body: |
; CHECK-NEXT: [[PHI8:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_13]], %bb.4, %96, %bb.9
; CHECK-NEXT: [[PHI9:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_13]], %bb.4, %108, %bb.9
; CHECK-NEXT: [[PHI10:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.4, %103, %bb.9
- ; CHECK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI10]], [[S_MOV_B32_9]], 0, implicit $exec
- ; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI9]], [[S_MOV_B32_10]], implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], [[S_ADD_I32_4]], implicit-def dead $scc
- ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_5]], [[S_MOV_B32_8]], implicit-def dead $scc
- ; CHECK-NEXT: [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 killed [[S_ADD_I32_6]], [[COPY21]], implicit $exec
- ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_5]], [[S_MOV_B32_9]], implicit-def dead $scc
+ ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI10]], [[S_MOV_B32_9]], 0, implicit $exec
+ ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI9]], [[S_MOV_B32_10]], implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], [[S_ADD_I32_2]], implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_3]], [[S_MOV_B32_8]], implicit-def dead $scc
+ ; CHECK-NEXT: [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 killed [[S_ADD_I32_4]], [[COPY21]], implicit $exec
+ ; CHECK-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_2]], 1, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_5]], 2, implicit-def dead $scc
+ ; CHECK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_OR3_B32_e64_]], [[S_ADD_I32_6]], 0, implicit $exec
+ ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_3]], [[S_MOV_B32_9]], implicit-def dead $scc
; CHECK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[V_CMP_GE_U32_e64_1]], [[PHI8]], implicit-def dead $scc
; CHECK-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_7]], implicit $exec
; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec