[llvm-branch-commits] [llvm] [AMDGPU] Support Wave Reduction for i16 types - 2 (PR #194810)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon May 4 02:12:18 PDT 2026
https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/194810
>From 92f2857bb792a9bb2a199164c77b0d5e8f1b7430 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Tue, 28 Apr 2026 14:26:15 +0530
Subject: [PATCH] [AMDGPU] Support Wave Reduction for i16 types - 2
Supported Ops: `add`, `sub`.
Supports only the iterative strategy; DPP is yet
to be supported.
Supports only Fake-16 versions of the lowering.
True-16 support is yet to be added.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 24 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 2 +
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll | 810 +++++++++++++----
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll | 835 ++++++++++++++----
4 files changed, 1320 insertions(+), 351 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a12de461cb098..d912073f35bd9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5641,6 +5641,10 @@ static uint64_t getIdentityValueForWaveReduction(unsigned Opc) {
case AMDGPU::V_MAX_U16_opsel_e64:
case AMDGPU::V_MAX_U16_fake16_e64:
case AMDGPU::V_MAX_U16_t16_e64:
+ case AMDGPU::V_ADD_I16_e64:
+ case AMDGPU::V_ADD_I16_fake16_e64:
+ case AMDGPU::V_SUB_I16_e64:
+ case AMDGPU::V_SUB_I16_fake16_e64:
return 0x0;
case AMDGPU::V_MAX_I16_e64:
case AMDGPU::V_MAX_I16_opsel_e64:
@@ -5707,7 +5711,9 @@ static bool is16bitWaveReduceOperation(unsigned Opc) {
Opc == AMDGPU::V_MAX_I16_opsel_e64 ||
Opc == AMDGPU::V_MAX_I16_fake16_e64 || Opc == AMDGPU::V_MAX_I16_e64 ||
Opc == AMDGPU::V_MIN_U16_t16_e64 || Opc == AMDGPU::V_MIN_I16_t16_e64 ||
- Opc == AMDGPU::V_MAX_U16_t16_e64 || Opc == AMDGPU::V_MAX_I16_t16_e64;
+ Opc == AMDGPU::V_MAX_U16_t16_e64 || Opc == AMDGPU::V_MAX_I16_t16_e64 ||
+ Opc == AMDGPU::V_ADD_I16_e64 || Opc == AMDGPU::V_ADD_I16_fake16_e64 ||
+ Opc == AMDGPU::V_SUB_I16_e64 || Opc == AMDGPU::V_SUB_I16_fake16_e64;
}
static bool is32bitWaveReduceOperation(unsigned Opc) {
@@ -5898,6 +5904,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
}
case AMDGPU::S_XOR_B32:
case AMDGPU::S_XOR_B64:
+ case AMDGPU::V_ADD_I16_e64:
+ case AMDGPU::V_ADD_I16_fake16_e64:
+ case AMDGPU::V_SUB_I16_e64:
+ case AMDGPU::V_SUB_I16_fake16_e64:
case AMDGPU::S_ADD_I32:
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::V_ADD_F32_e64:
@@ -5959,6 +5969,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
}
break;
}
+ case AMDGPU::V_SUB_I16_e64:
+ case AMDGPU::V_SUB_I16_fake16_e64:
case AMDGPU::S_SUB_I32: {
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
@@ -5971,6 +5983,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
+ case AMDGPU::V_ADD_I16_e64:
+ case AMDGPU::V_ADD_I16_fake16_e64:
case AMDGPU::S_ADD_I32: {
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(SrcReg)
@@ -6844,6 +6858,10 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
ST.getGeneration() >= AMDGPUSubtarget::GFX12
? AMDGPU::V_MAX_NUM_F64_e64
: AMDGPU::V_MAX_F64_e64);
+ case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I16:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(),
+ ST.hasTrue16BitInsts() ? AMDGPU::V_ADD_I16_fake16_e64
+ : AMDGPU::V_ADD_I16_e64);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
@@ -6855,6 +6873,10 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
ST.getGeneration() >= AMDGPUSubtarget::GFX12
? AMDGPU::V_ADD_F64_pseudo_e64
: AMDGPU::V_ADD_F64_e64);
+ case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I16:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(),
+ ST.hasTrue16BitInsts() ? AMDGPU::V_SUB_I16_fake16_e64
+ : AMDGPU::V_SUB_I16_e64);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2d0d9ce0ea1b7..f54e01a6b8656 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -412,6 +412,8 @@ defvar Operations = [
WaveReduceOp<"min", "I16", i16, SGPR_32, VSrc_b16, NotUseRealTrue16Insts>,
WaveReduceOp<"umax", "U16", i16, SGPR_32, VSrc_b16, NotUseRealTrue16Insts>,
WaveReduceOp<"max", "I16", i16, SGPR_32, VSrc_b16, NotUseRealTrue16Insts>,
+ WaveReduceOp<"add", "I16", i16, SGPR_32, VSrc_b16, NotUseRealTrue16Insts>,
+ WaveReduceOp<"sub", "I16", i16, SGPR_32, VSrc_b16, NotUseRealTrue16Insts>,
WaveReduceOp<"umin", "U16_t16", i16, SGPR_32, VSrcT_b16, UseRealTrue16Insts>,
WaveReduceOp<"min", "I16_t16", i16, SGPR_32, VSrcT_b16, UseRealTrue16Insts>,
WaveReduceOp<"umax", "U16_t16", i16, SGPR_32, VSrcT_b16, UseRealTrue16Insts>,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
index eddd57c8da852..88f0fa199e037 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
@@ -7,11 +7,469 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1064GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1032DAGISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1032GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12DAGISEL,GFX12DAGISEL-FAKE16 %s
+
+define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_i16:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_i16:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_and_b32 s3, 0xffff, s6
+; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_i16:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_i16:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_and_b32 s3, 0xffff, s6
+; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value_i16:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value_i16:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_clause 0x1
+; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_and_b32 s3, 0xffff, s6
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value_i16:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value_i16:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_clause 0x1
+; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_i16:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_i16:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_and_b32 s3, 0xffff, s6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_i16:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_i16:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+;
+; GFX12DAGISEL-LABEL: uniform_value_i16:
+; GFX12DAGISEL: ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12DAGISEL-NEXT: s_endpgm
+entry:
+ %result = call i16 @llvm.amdgcn.wave.reduce.add.i16(i16 %in, i32 1)
+ store i16 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define void @divergent_value_i16(ptr addrspace(1) %out, i16 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_i16:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT: v_add_i16 v3, s6, v3
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_i16:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, 0
+; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8GISEL-NEXT: v_add_i16 v3, s6, v3
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_i16:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9DAGISEL-NEXT: v_add_i16 v3, s6, v3
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_i16:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9GISEL-NEXT: v_add_i16 v3, s6, v3
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_i16:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064DAGISEL-NEXT: v_add_nc_i16 v3, s6, s8
+; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX1064DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_i16:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064GISEL-NEXT: v_add_nc_i16 v3, s6, s8
+; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX1064GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_i16:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0
+; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032DAGISEL-NEXT: v_add_nc_i16 v3, s5, s7
+; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5
+; GFX1032DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_i16:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s5, 0
+; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4
+; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6
+; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032GISEL-NEXT: v_add_nc_i16 v3, s5, s7
+; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5
+; GFX1032GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_i16:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164DAGISEL-NEXT: v_add_nc_i16 v3, s2, s4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_i16:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164GISEL-NEXT: v_add_nc_i16 v3, s2, s4
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164GISEL-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_i16:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132DAGISEL-NEXT: v_add_nc_i16 v3, s1, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX1132DAGISEL-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_i16:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s1, 0
+; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132GISEL-NEXT: v_add_nc_i16 v3, s1, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX1132GISEL-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12DAGISEL-LABEL: divergent_value_i16:
+; GFX12DAGISEL: ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX12DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v2, s2
+; GFX12DAGISEL-NEXT: s_bitset0_b32 s0, s2
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX12DAGISEL-NEXT: v_add_nc_i16 v3, s1, s3
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX12DAGISEL-NEXT: ; %bb.2:
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX12DAGISEL-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %result = call i16 @llvm.amdgcn.wave.reduce.add.i16(i16 %in, i32 1)
+ store i16 %result, ptr addrspace(1) %out
+ ret void
+}
define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-LABEL: uniform_value:
@@ -206,13 +664,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
-; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
; GFX8DAGISEL-NEXT: s_add_i32 s4, s4, s6
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -226,13 +684,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8GISEL-NEXT: s_mov_b32 s4, 0
-; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
; GFX8GISEL-NEXT: s_add_i32 s4, s4, s6
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -247,13 +705,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
-; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
; GFX9DAGISEL-NEXT: s_add_i32 s4, s4, s6
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -265,13 +723,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9GISEL-NEXT: s_mov_b32 s4, 0
-; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
; GFX9GISEL-NEXT: s_add_i32 s4, s4, s6
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -285,13 +743,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
-; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
; GFX1064DAGISEL-NEXT: s_add_i32 s4, s4, s6
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -303,13 +761,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
-; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
; GFX1064GISEL-NEXT: s_add_i32 s4, s4, s6
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -323,13 +781,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
-; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
; GFX1032DAGISEL-NEXT: s_add_i32 s2, s2, s5
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -341,13 +799,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
-; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
; GFX1032GISEL-NEXT: s_add_i32 s2, s2, s5
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -362,14 +820,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
-; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
; GFX1164DAGISEL-NEXT: s_add_i32 s4, s4, s6
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -382,14 +840,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
-; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
; GFX1164GISEL-NEXT: s_add_i32 s4, s4, s6
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -403,14 +861,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
-; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
; GFX1132DAGISEL-NEXT: s_add_i32 s2, s2, s5
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -423,14 +881,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
-; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
; GFX1132GISEL-NEXT: s_add_i32 s2, s2, s5
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -443,14 +901,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX12DAGISEL-NEXT: s_mov_b32 s2, 0
-; GFX12DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX12DAGISEL-NEXT: s_bitset0_b32 s3, s4
; GFX12DAGISEL-NEXT: s_add_co_i32 s2, s2, s5
; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12DAGISEL-NEXT: ; %bb.2:
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
@@ -2312,7 +2770,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2
; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -2320,24 +2778,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX8DAGISEL-NEXT: .LBB6_2: ; %Flow
+; GFX8DAGISEL-NEXT: .LBB8_2: ; %Flow
; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX8DAGISEL-NEXT: s_add_i32 s6, s6, s8
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX8DAGISEL-NEXT: ; %bb.5:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT: .LBB6_6: ; %endif
+; GFX8DAGISEL-NEXT: .LBB8_6: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -2352,7 +2810,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: ; implicit-def: $sgpr2
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -2360,24 +2818,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX8GISEL-NEXT: .LBB6_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB8_2: ; %Flow
; GFX8GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8GISEL-NEXT: s_mov_b32 s6, 0
-; GFX8GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX8GISEL-NEXT: s_add_i32 s6, s6, s8
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX8GISEL-NEXT: ; %bb.5:
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX8GISEL-NEXT: .LBB6_6: ; %endif
+; GFX8GISEL-NEXT: .LBB8_6: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -2392,7 +2850,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2
; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -2400,24 +2858,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX9DAGISEL-NEXT: .LBB6_2: ; %Flow
+; GFX9DAGISEL-NEXT: .LBB8_2: ; %Flow
; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX9DAGISEL-NEXT: s_add_i32 s6, s6, s8
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX9DAGISEL-NEXT: ; %bb.5:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT: .LBB6_6: ; %endif
+; GFX9DAGISEL-NEXT: .LBB8_6: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2431,7 +2889,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: ; implicit-def: $sgpr2
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -2439,24 +2897,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX9GISEL-NEXT: .LBB6_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB8_2: ; %Flow
; GFX9GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9GISEL-NEXT: s_mov_b32 s6, 0
-; GFX9GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX9GISEL-NEXT: s_add_i32 s6, s6, s8
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX9GISEL-NEXT: ; %bb.5:
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9GISEL-NEXT: .LBB6_6: ; %endif
+; GFX9GISEL-NEXT: .LBB8_6: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2470,7 +2928,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2
; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -2478,24 +2936,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX1064DAGISEL-NEXT: .LBB6_2: ; %Flow
+; GFX1064DAGISEL-NEXT: .LBB8_2: ; %Flow
; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1064DAGISEL-NEXT: s_add_i32 s6, s6, s8
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX1064DAGISEL-NEXT: ; %bb.5:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT: .LBB6_6: ; %endif
+; GFX1064DAGISEL-NEXT: .LBB8_6: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2509,7 +2967,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr2
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -2517,24 +2975,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX1064GISEL-NEXT: .LBB6_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow
; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1064GISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1064GISEL-NEXT: s_add_i32 s6, s6, s8
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX1064GISEL-NEXT: ; %bb.5:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1064GISEL-NEXT: .LBB6_6: ; %endif
+; GFX1064GISEL-NEXT: .LBB8_6: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2548,7 +3006,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -2556,24 +3014,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX1032DAGISEL-NEXT: .LBB6_2: ; %Flow
+; GFX1032DAGISEL-NEXT: .LBB8_2: ; %Flow
; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1032DAGISEL-NEXT: s_add_i32 s1, s1, s6
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX1032DAGISEL-NEXT: ; %bb.5:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT: .LBB6_6: ; %endif
+; GFX1032DAGISEL-NEXT: .LBB8_6: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2587,7 +3045,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr1
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -2595,24 +3053,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX1032GISEL-NEXT: .LBB6_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow
; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032GISEL-NEXT: s_mov_b32 s1, 0
-; GFX1032GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1032GISEL-NEXT: s_add_i32 s1, s1, s6
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX1032GISEL-NEXT: ; %bb.5:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1032GISEL-NEXT: .LBB6_6: ; %endif
+; GFX1032GISEL-NEXT: .LBB8_6: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2628,7 +3086,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -2637,25 +3095,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX1164DAGISEL-NEXT: .LBB6_2: ; %Flow
+; GFX1164DAGISEL-NEXT: .LBB8_2: ; %Flow
; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_add_i32 s6, s6, s8
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX1164DAGISEL-NEXT: ; %bb.5:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT: .LBB6_6: ; %endif
+; GFX1164DAGISEL-NEXT: .LBB8_6: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2671,7 +3129,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -2680,25 +3138,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX1164GISEL-NEXT: .LBB6_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow
; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1164GISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_add_i32 s6, s6, s8
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX1164GISEL-NEXT: ; %bb.5:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1164GISEL-NEXT: .LBB6_6: ; %endif
+; GFX1164GISEL-NEXT: .LBB8_6: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2714,7 +3172,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -2723,25 +3181,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX1132DAGISEL-NEXT: .LBB6_2: ; %Flow
+; GFX1132DAGISEL-NEXT: .LBB8_2: ; %Flow
; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_add_i32 s1, s1, s6
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX1132DAGISEL-NEXT: ; %bb.5:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT: .LBB6_6: ; %endif
+; GFX1132DAGISEL-NEXT: .LBB8_6: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2757,7 +3215,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -2766,25 +3224,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX1132GISEL-NEXT: .LBB6_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow
; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132GISEL-NEXT: s_mov_b32 s1, 0
-; GFX1132GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_add_i32 s1, s1, s6
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX1132GISEL-NEXT: ; %bb.5:
; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1132GISEL-NEXT: .LBB6_6: ; %endif
+; GFX1132GISEL-NEXT: .LBB8_6: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2800,7 +3258,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX12DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
; GFX12DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -2809,15 +3267,15 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX12DAGISEL-NEXT: .LBB6_2: ; %Flow
+; GFX12DAGISEL-NEXT: .LBB8_2: ; %Flow
; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX12DAGISEL-NEXT: ; %bb.3: ; %if
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX12DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
@@ -2825,10 +3283,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX12DAGISEL-NEXT: s_add_co_i32 s1, s1, s6
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX12DAGISEL-NEXT: ; %bb.5:
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX12DAGISEL-NEXT: .LBB6_6: ; %endif
+; GFX12DAGISEL-NEXT: .LBB8_6: ; %endif
; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -3076,7 +3534,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX8DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -3084,7 +3542,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8
; GFX8DAGISEL-NEXT: s_addc_u32 s5, s5, s10
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -3097,7 +3555,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX8GISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX8GISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -3105,7 +3563,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s8
; GFX8GISEL-NEXT: s_addc_u32 s5, s5, s10
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -3118,7 +3576,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX9DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -3126,7 +3584,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8
; GFX9DAGISEL-NEXT: s_addc_u32 s5, s5, s10
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -3139,7 +3597,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX9GISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX9GISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -3147,7 +3605,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s8
; GFX9GISEL-NEXT: s_addc_u32 s5, s5, s10
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -3160,7 +3618,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -3168,7 +3626,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064DAGISEL-NEXT: s_add_u32 s4, s4, s9
; GFX1064DAGISEL-NEXT: s_addc_u32 s5, s5, s10
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -3180,7 +3638,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX1064GISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX1064GISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -3188,7 +3646,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064GISEL-NEXT: s_add_u32 s4, s4, s9
; GFX1064GISEL-NEXT: s_addc_u32 s5, s5, s10
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -3200,7 +3658,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s7
@@ -3208,7 +3666,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032DAGISEL-NEXT: s_add_u32 s4, s4, s8
; GFX1032DAGISEL-NEXT: s_addc_u32 s5, s5, s9
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -3220,7 +3678,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s7
@@ -3228,7 +3686,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032GISEL-NEXT: s_add_u32 s4, s4, s8
; GFX1032GISEL-NEXT: s_addc_u32 s5, s5, s9
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -3240,7 +3698,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s4, s[2:3]
; GFX1164DAGISEL-NEXT: v_readlane_b32 s5, v2, s4
@@ -3249,7 +3707,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164DAGISEL-NEXT: s_add_u32 s0, s0, s5
; GFX1164DAGISEL-NEXT: s_addc_u32 s1, s1, s6
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -3261,7 +3719,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s4, s[2:3]
; GFX1164GISEL-NEXT: v_readlane_b32 s5, v2, s4
@@ -3270,7 +3728,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164GISEL-NEXT: s_add_u32 s0, s0, s5
; GFX1164GISEL-NEXT: s_addc_u32 s1, s1, s6
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -3282,7 +3740,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -3291,7 +3749,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132DAGISEL-NEXT: s_add_u32 s0, s0, s4
; GFX1132DAGISEL-NEXT: s_addc_u32 s1, s1, s5
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -3302,7 +3760,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -3311,7 +3769,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132GISEL-NEXT: s_add_u32 s0, s0, s4
; GFX1132GISEL-NEXT: s_addc_u32 s1, s1, s5
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -3326,7 +3784,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX12DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -3336,7 +3794,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX12DAGISEL-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX12DAGISEL-NEXT: ; %bb.2:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
@@ -3357,7 +3815,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s7, s[6:7]
@@ -3366,7 +3824,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8DAGISEL-NEXT: s_mul_hi_u32 s2, s2, s7
; GFX8DAGISEL-NEXT: s_mul_i32 s3, s3, s7
; GFX8DAGISEL-NEXT: s_add_u32 s7, s2, s3
-; GFX8DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX8DAGISEL-NEXT: .LBB11_2: ; %Flow
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -3395,7 +3853,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s7, s[6:7]
@@ -3404,13 +3862,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8GISEL-NEXT: s_mul_hi_u32 s2, s2, s7
; GFX8GISEL-NEXT: s_mul_i32 s3, s3, s7
; GFX8GISEL-NEXT: s_add_u32 s7, s2, s3
-; GFX8GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB11_2: ; %Flow
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
@@ -3422,7 +3880,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8GISEL-NEXT: s_add_u32 s7, s4, s5
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7
-; GFX8GISEL-NEXT: .LBB9_4: ; %endif
+; GFX8GISEL-NEXT: .LBB11_4: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -3437,7 +3895,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s5, s[4:5]
@@ -3446,7 +3904,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9DAGISEL-NEXT: s_mul_hi_u32 s2, s2, s5
; GFX9DAGISEL-NEXT: s_mul_i32 s3, s3, s5
; GFX9DAGISEL-NEXT: s_add_u32 s5, s2, s3
-; GFX9DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX9DAGISEL-NEXT: .LBB11_2: ; %Flow
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
@@ -3474,7 +3932,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s7, s[6:7]
@@ -3483,13 +3941,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9GISEL-NEXT: s_mul_hi_u32 s2, s2, s7
; GFX9GISEL-NEXT: s_mul_i32 s3, s3, s7
; GFX9GISEL-NEXT: s_add_u32 s7, s2, s3
-; GFX9GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB11_2: ; %Flow
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
@@ -3501,7 +3959,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9GISEL-NEXT: s_add_u32 s5, s6, s5
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX9GISEL-NEXT: .LBB9_4: ; %endif
+; GFX9GISEL-NEXT: .LBB11_4: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3516,7 +3974,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr8_sgpr9
; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1064DAGISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
@@ -3525,7 +3983,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s3, s8
; GFX1064DAGISEL-NEXT: s_mul_i32 s8, s2, s8
; GFX1064DAGISEL-NEXT: s_add_u32 s9, s9, s3
-; GFX1064DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1064DAGISEL-NEXT: .LBB11_2: ; %Flow
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[4:5]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s8
@@ -3553,7 +4011,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr8_sgpr9
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
@@ -3562,13 +4020,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064GISEL-NEXT: s_mul_i32 s3, s3, s8
; GFX1064GISEL-NEXT: s_mul_i32 s8, s2, s8
; GFX1064GISEL-NEXT: s_add_u32 s9, s9, s3
-; GFX1064GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB11_2: ; %Flow
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s8
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s9
; GFX1064GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
@@ -3580,7 +4038,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064GISEL-NEXT: s_add_u32 s5, s5, s7
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1064GISEL-NEXT: .LBB9_4: ; %endif
+; GFX1064GISEL-NEXT: .LBB11_4: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3595,7 +4053,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
@@ -3604,7 +4062,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s3, s4
; GFX1032DAGISEL-NEXT: s_mul_i32 s4, s2, s4
; GFX1032DAGISEL-NEXT: s_add_u32 s5, s5, s3
-; GFX1032DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1032DAGISEL-NEXT: .LBB11_2: ; %Flow
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4
@@ -3632,7 +4090,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s6, s6
@@ -3641,13 +4099,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032GISEL-NEXT: s_mul_i32 s3, s3, s6
; GFX1032GISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX1032GISEL-NEXT: s_add_u32 s7, s7, s3
-; GFX1032GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB11_2: ; %Flow
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
@@ -3659,7 +4117,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032GISEL-NEXT: s_add_u32 s5, s5, s7
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1032GISEL-NEXT: .LBB9_4: ; %endif
+; GFX1032GISEL-NEXT: .LBB11_4: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3676,7 +4134,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1164DAGISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1164DAGISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -3686,7 +4144,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s3, s8
; GFX1164DAGISEL-NEXT: s_mul_i32 s8, s2, s8
; GFX1164DAGISEL-NEXT: s_add_u32 s9, s9, s3
-; GFX1164DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1164DAGISEL-NEXT: .LBB11_2: ; %Flow
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7]
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s8
@@ -3718,7 +4176,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -3728,13 +4186,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164GISEL-NEXT: s_mul_i32 s3, s3, s8
; GFX1164GISEL-NEXT: s_mul_i32 s8, s2, s8
; GFX1164GISEL-NEXT: s_add_u32 s9, s9, s3
-; GFX1164GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB11_2: ; %Flow
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s8
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s9
; GFX1164GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec
@@ -3747,7 +4205,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164GISEL-NEXT: s_add_u32 s5, s7, s5
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1164GISEL-NEXT: .LBB9_4: ; %endif
+; GFX1164GISEL-NEXT: .LBB11_4: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -3764,7 +4222,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1132DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1132DAGISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -3774,7 +4232,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s3, s6
; GFX1132DAGISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX1132DAGISEL-NEXT: s_add_u32 s7, s7, s3
-; GFX1132DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1132DAGISEL-NEXT: .LBB11_2: ; %Flow
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
@@ -3804,7 +4262,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -3814,12 +4272,12 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132GISEL-NEXT: s_mul_i32 s3, s3, s6
; GFX1132GISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX1132GISEL-NEXT: s_add_u32 s7, s7, s3
-; GFX1132GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB11_2: ; %Flow
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
@@ -3832,7 +4290,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132GISEL-NEXT: s_add_u32 s5, s6, s5
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1132GISEL-NEXT: .LBB9_4: ; %endif
+; GFX1132GISEL-NEXT: .LBB11_4: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -3849,7 +4307,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX12DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
; GFX12DAGISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -3859,7 +4317,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX12DAGISEL-NEXT: s_mul_i32 s3, s3, s6
; GFX12DAGISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX12DAGISEL-NEXT: s_add_co_u32 s7, s7, s3
-; GFX12DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX12DAGISEL-NEXT: .LBB11_2: ; %Flow
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
@@ -3898,3 +4356,9 @@ endif:
store i64 %combine, ptr addrspace(1) %out
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1132DAGISEL-FAKE16: {{.*}}
+; GFX1132GISEL-FAKE16: {{.*}}
+; GFX1164DAGISEL-FAKE16: {{.*}}
+; GFX1164GISEL-FAKE16: {{.*}}
+; GFX12DAGISEL-FAKE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
index 71b33f9ddffa2..deceece4f3716 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
@@ -7,11 +7,486 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1064GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1032DAGISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1032GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mattr=-real-true16 -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12DAGISEL,GFX12DAGISEL-FAKE16 %s
+
+define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_i16:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_i16:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_and_b32 s3, 0xffff, s6
+; GFX8GISEL-NEXT: s_sub_i32 s3, 0, s3
+; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_i16:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_i16:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_and_b32 s3, 0xffff, s6
+; GFX9GISEL-NEXT: s_sub_i32 s3, 0, s3
+; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value_i16:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value_i16:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_clause 0x1
+; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_and_b32 s4, 0xffff, s6
+; GFX1064GISEL-NEXT: s_sub_i32 s3, 0, s4
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value_i16:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value_i16:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_clause 0x1
+; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX1032GISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_i16:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_i16:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_and_b32 s4, 0xffff, s6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_sub_i32 s3, 0, s4
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_i16:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_i16:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+;
+; GFX12DAGISEL-LABEL: uniform_value_i16:
+; GFX12DAGISEL: ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT: s_sub_co_i32 s2, 0, s2
+; GFX12DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12DAGISEL-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12DAGISEL-NEXT: s_endpgm
+entry:
+ %result = call i16 @llvm.amdgcn.wave.reduce.sub.i16(i16 %in, i32 1)
+ store i16 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define void @divergent_value_i16(ptr addrspace(1) %out, i16 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_i16:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT: v_sub_i16 v3, s6, v3
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_i16:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, 0
+; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8GISEL-NEXT: v_sub_i16 v3, s6, v3
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_i16:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9DAGISEL-NEXT: v_sub_i16 v3, s6, v3
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_i16:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9GISEL-NEXT: v_sub_i16 v3, s6, v3
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_i16:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064DAGISEL-NEXT: v_sub_nc_i16 v3, s6, s8
+; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX1064DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_i16:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064GISEL-NEXT: v_sub_nc_i16 v3, s6, s8
+; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX1064GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_i16:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0
+; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032DAGISEL-NEXT: v_sub_nc_i16 v3, s5, s7
+; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5
+; GFX1032DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_i16:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s5, 0
+; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4
+; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6
+; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032GISEL-NEXT: v_sub_nc_i16 v3, s5, s7
+; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5
+; GFX1032GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_i16:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164DAGISEL-NEXT: v_sub_nc_i16 v3, s2, s4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_i16:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164GISEL-NEXT: v_sub_nc_i16 v3, s2, s4
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164GISEL-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_i16:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132DAGISEL-NEXT: v_sub_nc_i16 v3, s1, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX1132DAGISEL-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_i16:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s1, 0
+; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132GISEL-NEXT: v_sub_nc_i16 v3, s1, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX1132GISEL-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12DAGISEL-LABEL: divergent_value_i16:
+; GFX12DAGISEL: ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX12DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v2, s2
+; GFX12DAGISEL-NEXT: s_bitset0_b32 s0, s2
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX12DAGISEL-NEXT: v_sub_nc_i16 v3, s1, s3
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX12DAGISEL-NEXT: ; %bb.2:
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, s1
+; GFX12DAGISEL-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %result = call i16 @llvm.amdgcn.wave.reduce.sub.i16(i16 %in, i32 1)
+ store i16 %result, ptr addrspace(1) %out
+ ret void
+}
define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-LABEL: uniform_value:
@@ -221,13 +696,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
-; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
; GFX8DAGISEL-NEXT: s_sub_i32 s4, s4, s6
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -241,13 +716,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8GISEL-NEXT: s_mov_b32 s4, 0
-; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
; GFX8GISEL-NEXT: s_sub_i32 s4, s4, s6
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -262,13 +737,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
-; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
; GFX9DAGISEL-NEXT: s_sub_i32 s4, s4, s6
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -280,13 +755,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9GISEL-NEXT: s_mov_b32 s4, 0
-; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
; GFX9GISEL-NEXT: s_sub_i32 s4, s4, s6
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -300,13 +775,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
-; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
; GFX1064DAGISEL-NEXT: s_sub_i32 s4, s4, s6
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -318,13 +793,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
-; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
; GFX1064GISEL-NEXT: s_sub_i32 s4, s4, s6
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -338,13 +813,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
-; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
; GFX1032DAGISEL-NEXT: s_sub_i32 s2, s2, s5
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -356,13 +831,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
-; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
; GFX1032GISEL-NEXT: s_sub_i32 s2, s2, s5
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -377,14 +852,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
-; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
; GFX1164DAGISEL-NEXT: s_sub_i32 s4, s4, s6
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -397,14 +872,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
-; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
; GFX1164GISEL-NEXT: s_sub_i32 s4, s4, s6
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -418,14 +893,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
-; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
; GFX1132DAGISEL-NEXT: s_sub_i32 s2, s2, s5
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -438,14 +913,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
-; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
; GFX1132GISEL-NEXT: s_sub_i32 s2, s2, s5
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -458,14 +933,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX12DAGISEL-NEXT: s_mov_b32 s2, 0
-; GFX12DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX12DAGISEL-NEXT: s_bitset0_b32 s3, s4
; GFX12DAGISEL-NEXT: s_sub_co_i32 s2, s2, s5
; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12DAGISEL-NEXT: ; %bb.2:
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
@@ -2402,7 +2877,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2
; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -2411,24 +2886,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_sub_i32 s3, 0, s6
; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX8DAGISEL-NEXT: .LBB6_2: ; %Flow
+; GFX8DAGISEL-NEXT: .LBB8_2: ; %Flow
; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX8DAGISEL-NEXT: s_sub_i32 s6, s6, s8
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX8DAGISEL-NEXT: ; %bb.5:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT: .LBB6_6: ; %endif
+; GFX8DAGISEL-NEXT: .LBB8_6: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -2443,7 +2918,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: ; implicit-def: $sgpr2
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -2452,24 +2927,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_sub_i32 s3, 0, s6
; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX8GISEL-NEXT: .LBB6_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB8_2: ; %Flow
; GFX8GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8GISEL-NEXT: s_mov_b32 s6, 0
-; GFX8GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX8GISEL-NEXT: s_sub_i32 s6, s6, s8
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX8GISEL-NEXT: ; %bb.5:
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX8GISEL-NEXT: .LBB6_6: ; %endif
+; GFX8GISEL-NEXT: .LBB8_6: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -2484,7 +2959,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2
; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -2493,24 +2968,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_sub_i32 s3, 0, s6
; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX9DAGISEL-NEXT: .LBB6_2: ; %Flow
+; GFX9DAGISEL-NEXT: .LBB8_2: ; %Flow
; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX9DAGISEL-NEXT: s_sub_i32 s6, s6, s8
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX9DAGISEL-NEXT: ; %bb.5:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT: .LBB6_6: ; %endif
+; GFX9DAGISEL-NEXT: .LBB8_6: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2524,7 +2999,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: ; implicit-def: $sgpr2
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -2533,24 +3008,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_sub_i32 s3, 0, s6
; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX9GISEL-NEXT: .LBB6_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB8_2: ; %Flow
; GFX9GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9GISEL-NEXT: s_mov_b32 s6, 0
-; GFX9GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX9GISEL-NEXT: s_sub_i32 s6, s6, s8
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX9GISEL-NEXT: ; %bb.5:
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9GISEL-NEXT: .LBB6_6: ; %endif
+; GFX9GISEL-NEXT: .LBB8_6: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2564,7 +3039,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2
; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -2573,24 +3048,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_sub_i32 s3, 0, s6
; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1064DAGISEL-NEXT: .LBB6_2: ; %Flow
+; GFX1064DAGISEL-NEXT: .LBB8_2: ; %Flow
; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1064DAGISEL-NEXT: s_sub_i32 s6, s6, s8
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX1064DAGISEL-NEXT: ; %bb.5:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT: .LBB6_6: ; %endif
+; GFX1064DAGISEL-NEXT: .LBB8_6: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2604,7 +3079,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr2
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -2613,24 +3088,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_sub_i32 s3, 0, s6
; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1064GISEL-NEXT: .LBB6_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow
; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1064GISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1064GISEL-NEXT: s_sub_i32 s6, s6, s8
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX1064GISEL-NEXT: ; %bb.5:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1064GISEL-NEXT: .LBB6_6: ; %endif
+; GFX1064GISEL-NEXT: .LBB8_6: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2644,7 +3119,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -2653,24 +3128,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_sub_i32 s1, 0, s1
; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX1032DAGISEL-NEXT: .LBB6_2: ; %Flow
+; GFX1032DAGISEL-NEXT: .LBB8_2: ; %Flow
; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1032DAGISEL-NEXT: s_sub_i32 s1, s1, s6
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX1032DAGISEL-NEXT: ; %bb.5:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT: .LBB6_6: ; %endif
+; GFX1032DAGISEL-NEXT: .LBB8_6: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2684,7 +3159,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr1
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -2693,24 +3168,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_sub_i32 s1, 0, s1
; GFX1032GISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX1032GISEL-NEXT: .LBB6_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow
; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032GISEL-NEXT: s_mov_b32 s1, 0
-; GFX1032GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1032GISEL-NEXT: s_sub_i32 s1, s1, s6
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX1032GISEL-NEXT: ; %bb.5:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1032GISEL-NEXT: .LBB6_6: ; %endif
+; GFX1032GISEL-NEXT: .LBB8_6: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2726,7 +3201,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -2736,25 +3211,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_sub_i32 s3, 0, s6
; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1164DAGISEL-NEXT: .LBB6_2: ; %Flow
+; GFX1164DAGISEL-NEXT: .LBB8_2: ; %Flow
; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_sub_i32 s6, s6, s8
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX1164DAGISEL-NEXT: ; %bb.5:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT: .LBB6_6: ; %endif
+; GFX1164DAGISEL-NEXT: .LBB8_6: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2770,7 +3245,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -2780,25 +3255,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_sub_i32 s3, 0, s6
; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1164GISEL-NEXT: .LBB6_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow
; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1164GISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_sub_i32 s6, s6, s8
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX1164GISEL-NEXT: ; %bb.5:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1164GISEL-NEXT: .LBB6_6: ; %endif
+; GFX1164GISEL-NEXT: .LBB8_6: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2814,7 +3289,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -2824,25 +3299,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_sub_i32 s1, 0, s1
; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX1132DAGISEL-NEXT: .LBB6_2: ; %Flow
+; GFX1132DAGISEL-NEXT: .LBB8_2: ; %Flow
; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_sub_i32 s1, s1, s6
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX1132DAGISEL-NEXT: ; %bb.5:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT: .LBB6_6: ; %endif
+; GFX1132DAGISEL-NEXT: .LBB8_6: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2858,7 +3333,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -2868,25 +3343,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_sub_i32 s1, 0, s1
; GFX1132GISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX1132GISEL-NEXT: .LBB6_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow
; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132GISEL-NEXT: s_mov_b32 s1, 0
-; GFX1132GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_sub_i32 s1, s1, s6
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX1132GISEL-NEXT: ; %bb.5:
; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1132GISEL-NEXT: .LBB6_6: ; %endif
+; GFX1132GISEL-NEXT: .LBB8_6: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2902,7 +3377,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX12DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB6_2
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_2
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
; GFX12DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -2912,15 +3387,15 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_sub_co_i32 s1, 0, s1
; GFX12DAGISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX12DAGISEL-NEXT: .LBB6_2: ; %Flow
+; GFX12DAGISEL-NEXT: .LBB8_2: ; %Flow
; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB6_6
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_6
; GFX12DAGISEL-NEXT: ; %bb.3: ; %if
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX12DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB8_4: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
@@ -2928,10 +3403,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX12DAGISEL-NEXT: s_sub_co_i32 s1, s1, s6
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB8_4
; GFX12DAGISEL-NEXT: ; %bb.5:
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX12DAGISEL-NEXT: .LBB6_6: ; %endif
+; GFX12DAGISEL-NEXT: .LBB8_6: ; %endif
; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -3236,7 +3711,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX8DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -3244,7 +3719,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8
; GFX8DAGISEL-NEXT: s_subb_u32 s5, s5, s10
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -3257,7 +3732,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX8GISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX8GISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -3265,7 +3740,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s8
; GFX8GISEL-NEXT: s_subb_u32 s5, s5, s10
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -3278,7 +3753,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX9DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -3286,7 +3761,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8
; GFX9DAGISEL-NEXT: s_subb_u32 s5, s5, s10
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -3299,7 +3774,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX9GISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX9GISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -3307,7 +3782,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s8
; GFX9GISEL-NEXT: s_subb_u32 s5, s5, s10
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -3320,7 +3795,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -3328,7 +3803,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064DAGISEL-NEXT: s_sub_u32 s4, s4, s9
; GFX1064DAGISEL-NEXT: s_subb_u32 s5, s5, s10
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -3340,7 +3815,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX1064GISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX1064GISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -3348,7 +3823,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064GISEL-NEXT: s_sub_u32 s4, s4, s9
; GFX1064GISEL-NEXT: s_subb_u32 s5, s5, s10
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -3360,7 +3835,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s7
@@ -3368,7 +3843,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032DAGISEL-NEXT: s_sub_u32 s4, s4, s8
; GFX1032DAGISEL-NEXT: s_subb_u32 s5, s5, s9
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -3380,7 +3855,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s7
@@ -3388,7 +3863,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032GISEL-NEXT: s_sub_u32 s4, s4, s8
; GFX1032GISEL-NEXT: s_subb_u32 s5, s5, s9
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -3400,7 +3875,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s4, s[2:3]
; GFX1164DAGISEL-NEXT: v_readlane_b32 s5, v2, s4
@@ -3409,7 +3884,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164DAGISEL-NEXT: s_sub_u32 s0, s0, s5
; GFX1164DAGISEL-NEXT: s_subb_u32 s1, s1, s6
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -3421,7 +3896,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s4, s[2:3]
; GFX1164GISEL-NEXT: v_readlane_b32 s5, v2, s4
@@ -3430,7 +3905,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164GISEL-NEXT: s_sub_u32 s0, s0, s5
; GFX1164GISEL-NEXT: s_subb_u32 s1, s1, s6
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -3442,7 +3917,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -3451,7 +3926,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132DAGISEL-NEXT: s_sub_u32 s0, s0, s4
; GFX1132DAGISEL-NEXT: s_subb_u32 s1, s1, s5
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -3462,7 +3937,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -3471,7 +3946,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132GISEL-NEXT: s_sub_u32 s0, s0, s4
; GFX1132GISEL-NEXT: s_subb_u32 s1, s1, s5
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -3486,7 +3961,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX12DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -3496,7 +3971,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX12DAGISEL-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX12DAGISEL-NEXT: ; %bb.2:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
@@ -3517,7 +3992,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -3530,13 +4005,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8DAGISEL-NEXT: s_mul_i32 s3, s3, s7
; GFX8DAGISEL-NEXT: s_add_u32 s2, s2, s3
; GFX8DAGISEL-NEXT: s_add_u32 s7, s2, s10
-; GFX8DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX8DAGISEL-NEXT: .LBB11_2: ; %Flow
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -3550,7 +4025,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8DAGISEL-NEXT: s_add_u32 s7, s4, s8
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s7
-; GFX8DAGISEL-NEXT: .LBB9_4: ; %endif
+; GFX8DAGISEL-NEXT: .LBB11_4: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -3564,7 +4039,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -3577,13 +4052,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8GISEL-NEXT: s_mul_i32 s3, s3, s7
; GFX8GISEL-NEXT: s_add_u32 s2, s2, s3
; GFX8GISEL-NEXT: s_add_u32 s7, s2, s10
-; GFX8GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB11_2: ; %Flow
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
@@ -3599,7 +4074,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8GISEL-NEXT: s_add_u32 s7, s4, s8
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7
-; GFX8GISEL-NEXT: .LBB9_4: ; %endif
+; GFX8GISEL-NEXT: .LBB11_4: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -3614,7 +4089,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -3627,13 +4102,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9DAGISEL-NEXT: s_mul_i32 s3, s3, s5
; GFX9DAGISEL-NEXT: s_add_u32 s2, s2, s3
; GFX9DAGISEL-NEXT: s_add_u32 s5, s2, s10
-; GFX9DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX9DAGISEL-NEXT: .LBB11_2: ; %Flow
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5
; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -3647,7 +4122,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9DAGISEL-NEXT: s_add_u32 s5, s5, s8
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX9DAGISEL-NEXT: .LBB9_4: ; %endif
+; GFX9DAGISEL-NEXT: .LBB11_4: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3660,7 +4135,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -3673,13 +4148,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9GISEL-NEXT: s_mul_i32 s3, s3, s7
; GFX9GISEL-NEXT: s_add_u32 s2, s2, s3
; GFX9GISEL-NEXT: s_add_u32 s7, s2, s10
-; GFX9GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB11_2: ; %Flow
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
@@ -3695,7 +4170,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9GISEL-NEXT: s_add_u32 s5, s5, s8
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX9GISEL-NEXT: .LBB9_4: ; %endif
+; GFX9GISEL-NEXT: .LBB11_4: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3710,7 +4185,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr8_sgpr9
; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1064DAGISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
@@ -3723,7 +4198,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064DAGISEL-NEXT: s_add_u32 s3, s10, s3
; GFX1064DAGISEL-NEXT: s_mul_i32 s8, s2, s8
; GFX1064DAGISEL-NEXT: s_add_u32 s9, s3, s9
-; GFX1064DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1064DAGISEL-NEXT: .LBB11_2: ; %Flow
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[4:5]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s8
@@ -3755,7 +4230,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr8_sgpr9
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
@@ -3768,13 +4243,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064GISEL-NEXT: s_add_u32 s3, s10, s3
; GFX1064GISEL-NEXT: s_mul_i32 s8, s2, s8
; GFX1064GISEL-NEXT: s_add_u32 s9, s3, s9
-; GFX1064GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB11_2: ; %Flow
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s8
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s9
; GFX1064GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
@@ -3790,7 +4265,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064GISEL-NEXT: s_add_u32 s5, s7, s5
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1064GISEL-NEXT: .LBB9_4: ; %endif
+; GFX1064GISEL-NEXT: .LBB11_4: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3805,7 +4280,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
@@ -3818,7 +4293,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032DAGISEL-NEXT: s_add_u32 s3, s9, s3
; GFX1032DAGISEL-NEXT: s_mul_i32 s4, s2, s4
; GFX1032DAGISEL-NEXT: s_add_u32 s5, s3, s5
-; GFX1032DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1032DAGISEL-NEXT: .LBB11_2: ; %Flow
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4
@@ -3850,7 +4325,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s6, s6
@@ -3863,13 +4338,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032GISEL-NEXT: s_add_u32 s3, s9, s3
; GFX1032GISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX1032GISEL-NEXT: s_add_u32 s7, s3, s7
-; GFX1032GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB11_2: ; %Flow
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
@@ -3885,7 +4360,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032GISEL-NEXT: s_add_u32 s5, s5, s8
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1032GISEL-NEXT: .LBB9_4: ; %endif
+; GFX1032GISEL-NEXT: .LBB11_4: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3902,7 +4377,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1164DAGISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1164DAGISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3917,7 +4392,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164DAGISEL-NEXT: s_add_u32 s3, s10, s3
; GFX1164DAGISEL-NEXT: s_mul_i32 s8, s2, s8
; GFX1164DAGISEL-NEXT: s_add_u32 s9, s3, s9
-; GFX1164DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1164DAGISEL-NEXT: .LBB11_2: ; %Flow
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7]
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s8
@@ -3953,7 +4428,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3968,13 +4443,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164GISEL-NEXT: s_add_u32 s3, s10, s3
; GFX1164GISEL-NEXT: s_mul_i32 s8, s2, s8
; GFX1164GISEL-NEXT: s_add_u32 s9, s3, s9
-; GFX1164GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB11_2: ; %Flow
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s8
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s9
; GFX1164GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec
@@ -3992,7 +4467,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164GISEL-NEXT: s_add_u32 s5, s5, s7
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1164GISEL-NEXT: .LBB9_4: ; %endif
+; GFX1164GISEL-NEXT: .LBB11_4: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -4009,7 +4484,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1132DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1132DAGISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -4024,7 +4499,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132DAGISEL-NEXT: s_add_u32 s3, s9, s3
; GFX1132DAGISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX1132DAGISEL-NEXT: s_add_u32 s7, s3, s7
-; GFX1132DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1132DAGISEL-NEXT: .LBB11_2: ; %Flow
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
@@ -4059,7 +4534,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -4074,12 +4549,12 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132GISEL-NEXT: s_add_u32 s3, s9, s3
; GFX1132GISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX1132GISEL-NEXT: s_add_u32 s7, s3, s7
-; GFX1132GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB11_2: ; %Flow
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
@@ -4097,7 +4572,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132GISEL-NEXT: s_add_u32 s5, s5, s6
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1132GISEL-NEXT: .LBB9_4: ; %endif
+; GFX1132GISEL-NEXT: .LBB11_4: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -4114,7 +4589,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX12DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
; GFX12DAGISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -4129,7 +4604,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX12DAGISEL-NEXT: s_add_co_u32 s3, s9, s3
; GFX12DAGISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX12DAGISEL-NEXT: s_add_co_u32 s7, s3, s7
-; GFX12DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX12DAGISEL-NEXT: .LBB11_2: ; %Flow
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
@@ -4174,3 +4649,9 @@ endif:
store i64 %combine, ptr addrspace(1) %out
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1132DAGISEL-FAKE16: {{.*}}
+; GFX1132GISEL-FAKE16: {{.*}}
+; GFX1164DAGISEL-FAKE16: {{.*}}
+; GFX1164GISEL-FAKE16: {{.*}}
+; GFX12DAGISEL-FAKE16: {{.*}}
More information about the llvm-branch-commits
mailing list