[llvm-branch-commits] [llvm] [AMDGPU] DPP wave reduction for long types - 2 (PR #189225)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sun Mar 29 04:09:26 PDT 2026
https://github.com/easyonaadit created https://github.com/llvm/llvm-project/pull/189225
Supported Ops: `add`, `sub`
From 0f973c85618ef5acba7ffd84a3c22cc37e38c347 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Thu, 26 Mar 2026 16:23:41 +0530
Subject: [PATCH] [AMDGPU] DPP wave reduction for long types - 2
Supported Ops: `add`, `sub`
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 93 +-
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll | 1221 ++++++++++++++--
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll | 1259 +++++++++++++++--
3 files changed, 2265 insertions(+), 308 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index dfec8aaf56767..4eb6bad007d59 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5736,6 +5736,8 @@ getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST) {
case AMDGPU::V_CMP_LT_I64_e64: // min.i64
case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
case AMDGPU::V_CMP_GT_I64_e64: // max.i64
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO:
DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;
break;
default:
@@ -5745,7 +5747,10 @@ getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST) {
if (!ST.getInstrInfo()->isVALU(Opc)) {
if (Opc == AMDGPU::S_SUB_I32)
ClampOpc = AMDGPU::S_ADD_I32;
- ClampOpc = ST.getInstrInfo()->getVALUOp(ClampOpc);
+ if (Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO)
+ ClampOpc = AMDGPU::V_ADD_CO_U32_e64;
+ else
+ ClampOpc = ST.getInstrInfo()->getVALUOp(ClampOpc);
}
return {DPPOpc, ClampOpc};
}
@@ -6301,40 +6306,80 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
DPPInstr.addImm(SISrcMods::NONE); // src1 modifier
if (!NeedsMovDPP)
DPPInstr.addReg(Src); // src1
+ if (AMDGPU::getNamedOperandIdx(DPPOpc, AMDGPU::OpName::clamp) >= 0)
+ DPPInstr.addImm(0); // clamp
DPPInstr
.addImm(DPPCtrl) // dpp-ctrl
.addImm(0xf) // row-mask
.addImm(0xf) // bank-mask
.addImm(0); // bound-control
};
- auto BuildClampInstr = [&](Register Dst, Register Src0, Register Src1) {
- auto ClampInstr = BuildMI(*CurrBB, MI, DL, TII->get(ClampOpc), Dst);
+ auto BuildClampInstr = [&](Register Dst, Register Src0, Register Src1,
+ bool isAddSub = false,
+ bool needsCarryIn = false,
+ Register CarryIn = Register()) {
+ unsigned InstrOpc = ClampOpc;
+ Register CarryOutReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ if (needsCarryIn)
+ InstrOpc = AMDGPU::V_ADDC_U32_e64;
+ auto ClampInstr = BuildMI(*CurrBB, MI, DL, TII->get(InstrOpc), Dst);
if (isFPOp)
ClampInstr.addImm(SISrcMods::NONE); // src0 mod
+ if (isAddSub) {
+ if (needsCarryIn)
+ ClampInstr.addReg(CarryOutReg,
+ RegState::Define |
+ RegState::Dead); // killed carry-out reg
+ else
+ ClampInstr.addReg(CarryOutReg, RegState::Define); // carry-out reg
+ }
ClampInstr.addReg(Src0); // src0
if (isFPOp)
ClampInstr.addImm(SISrcMods::NONE); // src1 mod
ClampInstr.addReg(Src1); // src1
- if (TII->hasIntClamp(*ClampInstr) || TII->hasFPClamp(*ClampInstr))
+ if (needsCarryIn)
+ ClampInstr.addReg(CarryIn, RegState::Kill); // carry-in reg
+ if (AMDGPU::getNamedOperandIdx(InstrOpc, AMDGPU::OpName::clamp) >= 0)
ClampInstr.addImm(0); // clamp
if (isFPOp)
ClampInstr.addImm(0); // omod
LastBcastInstr = ClampInstr;
+ return CarryOutReg;
};
auto BuildPostDPPInstr = [&](Register Src0, Register Src1) {
- Register CmpMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
- Register MinMaxResultReg = MRI.createVirtualRegister(SrcRegClass);
- BuildMI(*CurrBB, MI, DL, TII->get(Opc), CmpMaskReg)
- .addReg(Src0) // src0
- .addReg(Src1); // src1
- LastBcastInstr =
- BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B64_PSEUDO),
- MinMaxResultReg)
- .addReg(Src1) // src0
- .addReg(Src0) // src1
- .addReg(CmpMaskReg); // src2
- CurrBB = Expand64BitV_CND_MASK(*LastBcastInstr, CurrBB);
- return MinMaxResultReg;
+ bool isAddSubOpc =
+ Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO;
+ Register ReturnReg = MRI.createVirtualRegister(SrcRegClass);
+ if (isAddSubOpc) {
+ Register ResLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ResHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineOperand Src0Operand =
+ MachineOperand::CreateReg(Src0, /*isDef=*/false);
+ MachineOperand Src1Operand =
+ MachineOperand::CreateReg(Src1, /*isDef=*/false);
+ auto [Src0Lo, Src0Hi] = ExtractSubRegs(MI, Src0Operand, SrcRegClass);
+ auto [Src1Lo, Src1Hi] = ExtractSubRegs(MI, Src1Operand, SrcRegClass);
+ Register CarryReg =
+ BuildClampInstr(ResLo, Src0Lo.getReg(), Src1Lo.getReg(),
+ isAddSubOpc, /*needsCarryIn*/ false);
+ BuildClampInstr(ResHi, Src0Hi.getReg(), Src1Hi.getReg(), isAddSubOpc,
+ /*needsCarryIn*/ isAddSubOpc ? true : false,
+ CarryReg);
+ BuildRegSequence(*CurrBB, MI, ReturnReg, ResLo, ResHi);
+ } else {
+ Register CmpMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ BuildMI(*CurrBB, MI, DL, TII->get(Opc), CmpMaskReg)
+ .addReg(Src0) // src0
+ .addReg(Src1); // src1
+ LastBcastInstr =
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B64_PSEUDO),
+ ReturnReg)
+ .addReg(Src1) // src0
+ .addReg(Src0) // src1
+ .addReg(CmpMaskReg); // src2
+ CurrBB = Expand64BitV_CND_MASK(*LastBcastInstr, CurrBB);
+ }
+ return ReturnReg;
};
// Set inactive lanes to the identity value.
@@ -6549,14 +6594,22 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
BuildRegSequence(*CurrBB, MI, ReducedValSGPR, LaneValueLoReg,
LaneValueHiReg);
}
- if (Opc == AMDGPU::S_SUB_I32)
+ if (Opc == AMDGPU::S_SUB_I32) {
BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
.addImm(0)
.addReg(ReducedValSGPR);
+ } else if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+ auto NegatedValInstr =
+ BuildMI(*CurrBB, MI, DL, TII->get(Opc), NegatedReducedVal)
+ .addImm(0)
+ .addReg(ReducedValSGPR);
+ CurrBB = Expand64BitScalarArithmetic(*NegatedValInstr, CurrBB);
+ }
// Mark the final result as a whole-wave-mode calculation.
BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::STRICT_WWM), DstReg)
- .addReg(Opc == AMDGPU::S_SUB_I32 ? NegatedReducedVal
- : ReducedValSGPR);
+ .addReg(Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_SUB_U64_PSEUDO
+ ? NegatedReducedVal
+ : ReducedValSGPR);
RetBB = CurrBB;
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
index d7fa9bc800634..b78b580d3ed59 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
@@ -1009,6 +1009,943 @@ entry:
ret void
}
+define void @divergent_value_dpp_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_dpp_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX9DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_dpp_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX9GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1064DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1064DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1064DAGISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1064DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: s_clause 0x4 ; 20-byte Folded Reload
+; GFX1064DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1064DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1064GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1064GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1064GISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1064GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX1064GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX1064GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: s_clause 0x4 ; 20-byte Folded Reload
+; GFX1064GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1064GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064GISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s6
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s6
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1032DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v5, 31
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s6
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1032DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s6
+; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s6
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1032GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v5, 31
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s6
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT: s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1032GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_clause 0x4 ; 20-byte Folded Spill
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v4, s32
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:4
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v6, s32 offset:8
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v7, s32 offset:12
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v8, s32 offset:16
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[0:1]
+; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1164DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-NEXT: s_clause 0x4 ; 20-byte Folded Reload
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v4, off, s32
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:4
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v6, off, s32 offset:8
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v7, off, s32 offset:12
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v8, off, s32 offset:16
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_clause 0x4 ; 20-byte Folded Spill
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v4, s32
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:4
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v6, s32 offset:8
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v7, s32 offset:12
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v8, s32 offset:16
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[0:1]
+; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1164GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164GISEL-NEXT: s_clause 0x4 ; 20-byte Folded Reload
+; GFX1164GISEL-NEXT: scratch_load_b32 v4, off, s32
+; GFX1164GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:4
+; GFX1164GISEL-NEXT: scratch_load_b32 v6, off, s32 offset:8
+; GFX1164GISEL-NEXT: scratch_load_b32 v7, off, s32 offset:12
+; GFX1164GISEL-NEXT: scratch_load_b32 v8, off, s32 offset:16
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Spill
+; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v4, s32
+; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:4
+; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v6, s32 offset:8
+; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v7, s32 offset:12
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1132DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Reload
+; GFX1132DAGISEL-NEXT: scratch_load_b32 v4, off, s32
+; GFX1132DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:4
+; GFX1132DAGISEL-NEXT: scratch_load_b32 v6, off, s32 offset:8
+; GFX1132DAGISEL-NEXT: scratch_load_b32 v7, off, s32 offset:12
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT: s_clause 0x3 ; 16-byte Folded Spill
+; GFX1132GISEL-NEXT: scratch_store_b32 off, v4, s32
+; GFX1132GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:4
+; GFX1132GISEL-NEXT: scratch_store_b32 off, v6, s32 offset:8
+; GFX1132GISEL-NEXT: scratch_store_b32 off, v7, s32 offset:12
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s2
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1132GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132GISEL-NEXT: v_readlane_b32 s1, v5, 31
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT: s_clause 0x3 ; 16-byte Folded Reload
+; GFX1132GISEL-NEXT: scratch_load_b32 v4, off, s32
+; GFX1132GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:4
+; GFX1132GISEL-NEXT: scratch_load_b32 v6, off, s32 offset:8
+; GFX1132GISEL-NEXT: scratch_load_b32 v7, off, s32 offset:12
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX12DAGISEL: ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Spill
+; GFX12DAGISEL-NEXT: scratch_store_b32 off, v4, s32
+; GFX12DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:4
+; GFX12DAGISEL-NEXT: scratch_store_b32 off, v6, s32 offset:8
+; GFX12DAGISEL-NEXT: scratch_store_b32 off, v7, s32 offset:12
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s2, -1
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX12DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s2
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX12DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX12DAGISEL-NEXT: s_wait_dscnt 0x1
+; GFX12DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT: s_wait_dscnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT: v_readlane_b32 s0, v4, 31
+; GFX12DAGISEL-NEXT: v_readlane_b32 s1, v5, 31
+; GFX12DAGISEL-NEXT: s_mov_b32 exec_lo, s2
+; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Reload
+; GFX12DAGISEL-NEXT: scratch_load_b32 v4, off, s32
+; GFX12DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:4
+; GFX12DAGISEL-NEXT: scratch_load_b32 v6, off, s32 offset:8
+; GFX12DAGISEL-NEXT: scratch_load_b32 v7, off, s32 offset:12
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX12DAGISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %in, i32 2)
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
; GFX8DAGISEL-LABEL: default_stratergy:
; GFX8DAGISEL: ; %bb.0: ; %entry
@@ -1375,7 +2312,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2
; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -1383,24 +2320,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX8DAGISEL-NEXT: .LBB5_2: ; %Flow
+; GFX8DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX8DAGISEL-NEXT: s_add_i32 s6, s6, s8
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX8DAGISEL-NEXT: ; %bb.5:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX8DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -1415,7 +2352,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -1423,20 +2360,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_mul_i32 s6, s6, s2
-; GFX8GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB6_2: ; %Flow
; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8GISEL-NEXT: s_mov_b32 s6, 0
-; GFX8GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX8GISEL-NEXT: s_add_i32 s6, s6, s8
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX8GISEL-NEXT: .LBB5_5: ; %endif
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX8GISEL-NEXT: .LBB6_5: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
@@ -1452,7 +2389,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2
; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -1460,24 +2397,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX9DAGISEL-NEXT: .LBB5_2: ; %Flow
+; GFX9DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX9DAGISEL-NEXT: s_add_i32 s6, s6, s8
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX9DAGISEL-NEXT: ; %bb.5:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX9DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1491,7 +2428,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -1499,20 +2436,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s2
-; GFX9GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB6_2: ; %Flow
; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9GISEL-NEXT: s_mov_b32 s6, 0
-; GFX9GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX9GISEL-NEXT: s_add_i32 s6, s6, s8
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX9GISEL-NEXT: .LBB5_5: ; %endif
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX9GISEL-NEXT: .LBB6_5: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -1527,7 +2464,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2
; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -1535,24 +2472,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX1064DAGISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1064DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1064DAGISEL-NEXT: s_add_i32 s6, s6, s8
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1064DAGISEL-NEXT: ; %bb.5:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1064DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1566,7 +2503,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -1574,20 +2511,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s2
-; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1064GISEL-NEXT: s_add_i32 s6, s6, s8
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1064GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1064GISEL-NEXT: .LBB6_5: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -1602,7 +2539,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -1610,24 +2547,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX1032DAGISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1032DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1032DAGISEL-NEXT: s_add_i32 s1, s1, s6
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1032DAGISEL-NEXT: ; %bb.5:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1032DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1641,7 +2578,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -1649,20 +2586,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2
-; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
-; GFX1032GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1032GISEL-NEXT: s_add_i32 s0, s0, s6
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1032GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1032GISEL-NEXT: .LBB6_5: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1679,7 +2616,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -1688,25 +2625,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX1164DAGISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1164DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_add_i32 s6, s6, s8
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1164DAGISEL-NEXT: ; %bb.5:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1164DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1722,7 +2659,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -1731,21 +2668,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mul_i32 s6, s6, s2
-; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_add_i32 s6, s6, s8
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1164GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1164GISEL-NEXT: .LBB6_5: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -1762,7 +2699,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -1771,25 +2708,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX1132DAGISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1132DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_add_i32 s1, s1, s6
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1132DAGISEL-NEXT: ; %bb.5:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1132DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1805,7 +2742,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -1814,21 +2751,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2
-; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
-; GFX1132GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_add_i32 s0, s0, s6
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1132GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1132GISEL-NEXT: .LBB6_5: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -1844,7 +2781,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX12DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
; GFX12DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -1853,15 +2790,15 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX12DAGISEL-NEXT: .LBB5_2: ; %Flow
+; GFX12DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX12DAGISEL-NEXT: ; %bb.3: ; %if
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX12DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
@@ -1869,10 +2806,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX12DAGISEL-NEXT: s_add_co_i32 s1, s1, s6
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX12DAGISEL-NEXT: ; %bb.5:
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX12DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX12DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2120,7 +3057,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX8DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -2128,7 +3065,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8
; GFX8DAGISEL-NEXT: s_addc_u32 s5, s5, s10
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2141,7 +3078,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX8GISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX8GISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -2149,7 +3086,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s8
; GFX8GISEL-NEXT: s_addc_u32 s5, s5, s10
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2162,7 +3099,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX9DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -2170,7 +3107,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8
; GFX9DAGISEL-NEXT: s_addc_u32 s5, s5, s10
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2183,7 +3120,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX9GISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX9GISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -2191,7 +3128,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s8
; GFX9GISEL-NEXT: s_addc_u32 s5, s5, s10
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2204,7 +3141,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -2212,7 +3149,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064DAGISEL-NEXT: s_add_u32 s4, s4, s9
; GFX1064DAGISEL-NEXT: s_addc_u32 s5, s5, s10
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2224,7 +3161,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX1064GISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX1064GISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -2232,7 +3169,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064GISEL-NEXT: s_add_u32 s4, s4, s9
; GFX1064GISEL-NEXT: s_addc_u32 s5, s5, s10
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2244,7 +3181,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s7
@@ -2252,7 +3189,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032DAGISEL-NEXT: s_add_u32 s4, s4, s8
; GFX1032DAGISEL-NEXT: s_addc_u32 s5, s5, s9
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2264,7 +3201,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s7
@@ -2272,7 +3209,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032GISEL-NEXT: s_add_u32 s4, s4, s8
; GFX1032GISEL-NEXT: s_addc_u32 s5, s5, s9
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2284,7 +3221,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s4, s[2:3]
; GFX1164DAGISEL-NEXT: v_readlane_b32 s5, v2, s4
@@ -2293,7 +3230,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164DAGISEL-NEXT: s_add_u32 s0, s0, s5
; GFX1164DAGISEL-NEXT: s_addc_u32 s1, s1, s6
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -2305,7 +3242,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s4, s[2:3]
; GFX1164GISEL-NEXT: v_readlane_b32 s5, v2, s4
@@ -2314,7 +3251,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164GISEL-NEXT: s_add_u32 s0, s0, s5
; GFX1164GISEL-NEXT: s_addc_u32 s1, s1, s6
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -2326,7 +3263,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -2335,7 +3272,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132DAGISEL-NEXT: s_add_u32 s0, s0, s4
; GFX1132DAGISEL-NEXT: s_addc_u32 s1, s1, s5
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -2346,7 +3283,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -2355,7 +3292,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132GISEL-NEXT: s_add_u32 s0, s0, s4
; GFX1132GISEL-NEXT: s_addc_u32 s1, s1, s5
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -2370,7 +3307,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX12DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -2380,7 +3317,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX12DAGISEL-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX12DAGISEL-NEXT: ; %bb.2:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
@@ -2401,7 +3338,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s7, s[6:7]
@@ -2410,7 +3347,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8DAGISEL-NEXT: s_mul_hi_u32 s2, s2, s7
; GFX8DAGISEL-NEXT: s_mul_i32 s3, s3, s7
; GFX8DAGISEL-NEXT: s_add_u32 s7, s2, s3
-; GFX8DAGISEL-NEXT: .LBB8_2: ; %Flow
+; GFX8DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -2439,7 +3376,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s7, s[6:7]
@@ -2448,10 +3385,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8GISEL-NEXT: s_mul_hi_u32 s2, s2, s7
; GFX8GISEL-NEXT: s_mul_i32 s3, s3, s7
; GFX8GISEL-NEXT: s_add_u32 s7, s2, s3
-; GFX8GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB9_2: ; %Flow
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
@@ -2461,7 +3398,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8GISEL-NEXT: s_mul_hi_u32 s4, s4, s7
; GFX8GISEL-NEXT: s_mul_i32 s5, s5, s7
; GFX8GISEL-NEXT: s_add_u32 s7, s4, s5
-; GFX8GISEL-NEXT: .LBB8_4: ; %endif
+; GFX8GISEL-NEXT: .LBB9_4: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -2478,7 +3415,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s5, s[4:5]
@@ -2487,7 +3424,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9DAGISEL-NEXT: s_mul_hi_u32 s2, s2, s5
; GFX9DAGISEL-NEXT: s_mul_i32 s3, s3, s5
; GFX9DAGISEL-NEXT: s_add_u32 s5, s2, s3
-; GFX9DAGISEL-NEXT: .LBB8_2: ; %Flow
+; GFX9DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
@@ -2515,7 +3452,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s7, s[6:7]
@@ -2524,10 +3461,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9GISEL-NEXT: s_mul_hi_u32 s2, s2, s7
; GFX9GISEL-NEXT: s_mul_i32 s3, s3, s7
; GFX9GISEL-NEXT: s_add_u32 s7, s2, s3
-; GFX9GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB9_2: ; %Flow
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
@@ -2537,7 +3474,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9GISEL-NEXT: s_mul_hi_u32 s5, s8, s4
; GFX9GISEL-NEXT: s_mul_i32 s4, s9, s4
; GFX9GISEL-NEXT: s_add_u32 s7, s5, s4
-; GFX9GISEL-NEXT: .LBB8_4: ; %endif
+; GFX9GISEL-NEXT: .LBB9_4: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2554,7 +3491,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr8_sgpr9
; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1064DAGISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
@@ -2563,7 +3500,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s3, s8
; GFX1064DAGISEL-NEXT: s_mul_i32 s8, s2, s8
; GFX1064DAGISEL-NEXT: s_add_u32 s9, s9, s3
-; GFX1064DAGISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1064DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[4:5]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s8
@@ -2591,7 +3528,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -2600,10 +3537,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064GISEL-NEXT: s_mul_i32 s3, s3, s6
; GFX1064GISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX1064GISEL-NEXT: s_add_u32 s7, s7, s3
-; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
@@ -2613,7 +3550,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064GISEL-NEXT: s_mul_i32 s7, s7, s4
; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s4
; GFX1064GISEL-NEXT: s_add_u32 s7, s5, s7
-; GFX1064GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1064GISEL-NEXT: .LBB9_4: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2630,7 +3567,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
@@ -2639,7 +3576,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s3, s4
; GFX1032DAGISEL-NEXT: s_mul_i32 s4, s2, s4
; GFX1032DAGISEL-NEXT: s_add_u32 s5, s5, s3
-; GFX1032DAGISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1032DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4
@@ -2667,7 +3604,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s6, s6
@@ -2676,10 +3613,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032GISEL-NEXT: s_mul_i32 s3, s3, s6
; GFX1032GISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX1032GISEL-NEXT: s_add_u32 s7, s7, s3
-; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
@@ -2689,7 +3626,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032GISEL-NEXT: s_mul_i32 s5, s7, s3
; GFX1032GISEL-NEXT: s_mul_i32 s6, s6, s3
; GFX1032GISEL-NEXT: s_add_u32 s7, s4, s5
-; GFX1032GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1032GISEL-NEXT: .LBB9_4: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2708,7 +3645,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1164DAGISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1164DAGISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -2718,7 +3655,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s3, s8
; GFX1164DAGISEL-NEXT: s_mul_i32 s8, s2, s8
; GFX1164DAGISEL-NEXT: s_add_u32 s9, s9, s3
-; GFX1164DAGISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1164DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7]
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s8
@@ -2750,7 +3687,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -2760,10 +3697,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164GISEL-NEXT: s_mul_i32 s3, s3, s6
; GFX1164GISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX1164GISEL-NEXT: s_add_u32 s7, s7, s3
-; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec
@@ -2774,7 +3711,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164GISEL-NEXT: s_mul_i32 s5, s5, s6
; GFX1164GISEL-NEXT: s_mul_i32 s6, s4, s6
; GFX1164GISEL-NEXT: s_add_u32 s7, s7, s5
-; GFX1164GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1164GISEL-NEXT: .LBB9_4: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2793,7 +3730,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1132DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1132DAGISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -2803,7 +3740,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s3, s6
; GFX1132DAGISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX1132DAGISEL-NEXT: s_add_u32 s7, s7, s3
-; GFX1132DAGISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1132DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
@@ -2833,7 +3770,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -2843,10 +3780,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132GISEL-NEXT: s_mul_i32 s3, s3, s6
; GFX1132GISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX1132GISEL-NEXT: s_add_u32 s7, s7, s3
-; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
@@ -2857,7 +3794,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132GISEL-NEXT: s_mul_i32 s5, s5, s3
; GFX1132GISEL-NEXT: s_mul_i32 s6, s4, s3
; GFX1132GISEL-NEXT: s_add_u32 s7, s7, s5
-; GFX1132GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1132GISEL-NEXT: .LBB9_4: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
@@ -2875,7 +3812,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX12DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
; GFX12DAGISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -2885,7 +3822,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX12DAGISEL-NEXT: s_mul_i32 s3, s3, s6
; GFX12DAGISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX12DAGISEL-NEXT: s_add_co_u32 s7, s7, s3
-; GFX12DAGISEL-NEXT: .LBB8_2: ; %Flow
+; GFX12DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
index d3341eb0f8099..4d83751d4454d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
@@ -1054,6 +1054,973 @@ entry:
ret void
}
+define void @divergent_value_dpp_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_sub_u32 s4, 0, s6
+; GFX8DAGISEL-NEXT: s_subb_u32 s5, 0, s7
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_dpp_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_sub_u32 s4, 0, s6
+; GFX8GISEL-NEXT: s_subb_u32 s5, 0, s7
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX9DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_sub_u32 s4, 0, s6
+; GFX9DAGISEL-NEXT: s_subb_u32 s5, 0, s7
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_dpp_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX9GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: s_sub_u32 s4, 0, s6
+; GFX9GISEL-NEXT: s_subb_u32 s5, 0, s7
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX9GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1064DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1064DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1064DAGISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1064DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: s_sub_u32 s4, 0, s6
+; GFX1064DAGISEL-NEXT: s_subb_u32 s5, 0, s7
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: s_clause 0x4 ; 20-byte Folded Reload
+; GFX1064DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1064DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1064GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1064GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1064GISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1064GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v5, v7, vcc
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX1064GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: s_sub_u32 s4, 0, s6
+; GFX1064GISEL-NEXT: s_subb_u32 s5, 0, s7
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1064GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: s_clause 0x4 ; 20-byte Folded Reload
+; GFX1064GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1064GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064GISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s4
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1032DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v5, 31
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT: s_sub_u32 s4, 0, s5
+; GFX1032DAGISEL-NEXT: s_subb_u32 s5, 0, s6
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1032DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s4
+; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1032GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v5, 31
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT: s_sub_u32 s4, 0, s5
+; GFX1032GISEL-NEXT: s_subb_u32 s5, 0, s6
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT: s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1032GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_clause 0x4 ; 20-byte Folded Spill
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v4, s32
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:4
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v6, s32 offset:8
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v7, s32 offset:12
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v8, s32 offset:16
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[0:1]
+; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1164DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_sub_u32 s0, 0, s2
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-NEXT: s_subb_u32 s1, 0, s3
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_clause 0x4 ; 20-byte Folded Reload
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v4, off, s32
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:4
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v6, off, s32 offset:8
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v7, off, s32 offset:12
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v8, off, s32 offset:16
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_clause 0x4 ; 20-byte Folded Spill
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v4, s32
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:4
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v6, s32 offset:8
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v7, s32 offset:12
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v8, s32 offset:16
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[0:1]
+; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1164GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT: v_add_co_u32 v4, vcc, v4, v6
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc
+; GFX1164GISEL-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: s_sub_u32 s0, 0, s2
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164GISEL-NEXT: s_subb_u32 s1, 0, s3
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_clause 0x4 ; 20-byte Folded Reload
+; GFX1164GISEL-NEXT: scratch_load_b32 v4, off, s32
+; GFX1164GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:4
+; GFX1164GISEL-NEXT: scratch_load_b32 v6, off, s32 offset:8
+; GFX1164GISEL-NEXT: scratch_load_b32 v7, off, s32 offset:12
+; GFX1164GISEL-NEXT: scratch_load_b32 v8, off, s32 offset:16
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Spill
+; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v4, s32
+; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:4
+; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v6, s32 offset:8
+; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v7, s32 offset:12
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s0
+; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s0
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1132DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s1, v4, 31
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s2, v5, 31
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_sub_u32 s0, 0, s1
+; GFX1132DAGISEL-NEXT: s_subb_u32 s1, 0, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Reload
+; GFX1132DAGISEL-NEXT: scratch_load_b32 v4, off, s32
+; GFX1132DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:4
+; GFX1132DAGISEL-NEXT: scratch_load_b32 v6, off, s32 offset:8
+; GFX1132DAGISEL-NEXT: scratch_load_b32 v7, off, s32 offset:12
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT: s_clause 0x3 ; 16-byte Folded Spill
+; GFX1132GISEL-NEXT: scratch_store_b32 off, v4, s32
+; GFX1132GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:4
+; GFX1132GISEL-NEXT: scratch_store_b32 off, v6, s32 offset:8
+; GFX1132GISEL-NEXT: scratch_store_b32 off, v7, s32 offset:12
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s0
+; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s0
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1132GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_readlane_b32 s1, v4, 31
+; GFX1132GISEL-NEXT: v_readlane_b32 s2, v5, 31
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT: s_sub_u32 s0, 0, s1
+; GFX1132GISEL-NEXT: s_subb_u32 s1, 0, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT: s_clause 0x3 ; 16-byte Folded Reload
+; GFX1132GISEL-NEXT: scratch_load_b32 v4, off, s32
+; GFX1132GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:4
+; GFX1132GISEL-NEXT: scratch_load_b32 v6, off, s32 offset:8
+; GFX1132GISEL-NEXT: scratch_load_b32 v7, off, s32 offset:12
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX12DAGISEL: ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Spill
+; GFX12DAGISEL-NEXT: scratch_store_b32 off, v4, s32
+; GFX12DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:4
+; GFX12DAGISEL-NEXT: scratch_store_b32 off, v6, s32 offset:8
+; GFX12DAGISEL-NEXT: scratch_store_b32 off, v7, s32 offset:12
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s2, -1
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX12DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s2
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX12DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX12DAGISEL-NEXT: s_wait_dscnt 0x1
+; GFX12DAGISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12DAGISEL-NEXT: s_wait_dscnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_vcc(0)
+; GFX12DAGISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12DAGISEL-NEXT: v_readlane_b32 s0, v4, 31
+; GFX12DAGISEL-NEXT: v_readlane_b32 s1, v5, 31
+; GFX12DAGISEL-NEXT: s_mov_b32 exec_lo, s2
+; GFX12DAGISEL-NEXT: s_sub_nc_u64 s[0:1], 0, s[0:1]
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Reload
+; GFX12DAGISEL-NEXT: scratch_load_b32 v4, off, s32
+; GFX12DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:4
+; GFX12DAGISEL-NEXT: scratch_load_b32 v6, off, s32 offset:8
+; GFX12DAGISEL-NEXT: scratch_load_b32 v7, off, s32 offset:12
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX12DAGISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %result = call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 %in, i32 2)
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
; GFX8DAGISEL-LABEL: default_stratergy:
; GFX8DAGISEL: ; %bb.0: ; %entry
@@ -1435,7 +2402,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2
; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -1444,24 +2411,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_sub_i32 s3, 0, s6
; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX8DAGISEL-NEXT: .LBB5_2: ; %Flow
+; GFX8DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX8DAGISEL-NEXT: s_sub_i32 s6, s6, s8
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX8DAGISEL-NEXT: ; %bb.5:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX8DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -1476,7 +2443,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -1485,20 +2452,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_sub_i32 s3, 0, s6
; GFX8GISEL-NEXT: s_mul_i32 s6, s3, s2
-; GFX8GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB6_2: ; %Flow
; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8GISEL-NEXT: s_mov_b32 s6, 0
-; GFX8GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX8GISEL-NEXT: s_sub_i32 s6, s6, s8
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX8GISEL-NEXT: .LBB5_5: ; %endif
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX8GISEL-NEXT: .LBB6_5: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
@@ -1514,7 +2481,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2
; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -1523,24 +2490,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_sub_i32 s3, 0, s6
; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX9DAGISEL-NEXT: .LBB5_2: ; %Flow
+; GFX9DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX9DAGISEL-NEXT: s_sub_i32 s6, s6, s8
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX9DAGISEL-NEXT: ; %bb.5:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX9DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1554,7 +2521,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -1563,20 +2530,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_sub_i32 s3, 0, s6
; GFX9GISEL-NEXT: s_mul_i32 s6, s3, s2
-; GFX9GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB6_2: ; %Flow
; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9GISEL-NEXT: s_mov_b32 s6, 0
-; GFX9GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX9GISEL-NEXT: s_sub_i32 s6, s6, s8
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX9GISEL-NEXT: .LBB5_5: ; %endif
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX9GISEL-NEXT: .LBB6_5: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -1591,7 +2558,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2
; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -1600,24 +2567,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_sub_i32 s3, 0, s6
; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1064DAGISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1064DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1064DAGISEL-NEXT: s_sub_i32 s6, s6, s8
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1064DAGISEL-NEXT: ; %bb.5:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1064DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1631,7 +2598,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -1640,20 +2607,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_sub_i32 s3, 0, s6
; GFX1064GISEL-NEXT: s_mul_i32 s6, s3, s2
-; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1064GISEL-NEXT: s_sub_i32 s6, s6, s8
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1064GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1064GISEL-NEXT: .LBB6_5: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -1668,7 +2635,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -1677,24 +2644,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_sub_i32 s1, 0, s1
; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX1032DAGISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1032DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1032DAGISEL-NEXT: s_sub_i32 s1, s1, s6
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1032DAGISEL-NEXT: ; %bb.5:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1032DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1708,7 +2675,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -1717,20 +2684,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_sub_i32 s0, 0, s0
; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2
-; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
-; GFX1032GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1032GISEL-NEXT: s_sub_i32 s0, s0, s6
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1032GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1032GISEL-NEXT: .LBB6_5: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1747,7 +2714,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -1757,25 +2724,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_sub_i32 s3, 0, s6
; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1164DAGISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1164DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_sub_i32 s6, s6, s8
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1164DAGISEL-NEXT: ; %bb.5:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1164DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1791,7 +2758,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
@@ -1801,21 +2768,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_sub_i32 s3, 0, s6
; GFX1164GISEL-NEXT: s_mul_i32 s6, s3, s2
-; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_sub_i32 s6, s6, s8
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1164GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1164GISEL-NEXT: .LBB6_5: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -1832,7 +2799,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -1842,25 +2809,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_sub_i32 s1, 0, s1
; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX1132DAGISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1132DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_sub_i32 s1, s1, s6
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1132DAGISEL-NEXT: ; %bb.5:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1132DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1876,7 +2843,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -1886,21 +2853,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_sub_i32 s0, 0, s0
; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2
-; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
-; GFX1132GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_sub_i32 s0, s0, s6
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1132GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1132GISEL-NEXT: .LBB6_5: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -1916,7 +2883,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX12DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
; GFX12DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
@@ -1926,15 +2893,15 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_sub_co_i32 s1, 0, s1
; GFX12DAGISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX12DAGISEL-NEXT: .LBB5_2: ; %Flow
+; GFX12DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX12DAGISEL-NEXT: ; %bb.3: ; %if
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX12DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
@@ -1942,10 +2909,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX12DAGISEL-NEXT: s_sub_co_i32 s1, s1, s6
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX12DAGISEL-NEXT: ; %bb.5:
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX12DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX12DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -2250,7 +3217,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX8DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -2258,7 +3225,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8
; GFX8DAGISEL-NEXT: s_subb_u32 s5, s5, s10
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2271,7 +3238,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX8GISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX8GISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -2279,7 +3246,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s8
; GFX8GISEL-NEXT: s_subb_u32 s5, s5, s10
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2292,7 +3259,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX9DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -2300,7 +3267,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8
; GFX9DAGISEL-NEXT: s_subb_u32 s5, s5, s10
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2313,7 +3280,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX9GISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX9GISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -2321,7 +3288,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s8
; GFX9GISEL-NEXT: s_subb_u32 s5, s5, s10
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2334,7 +3301,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -2342,7 +3309,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064DAGISEL-NEXT: s_sub_u32 s4, s4, s9
; GFX1064DAGISEL-NEXT: s_subb_u32 s5, s5, s10
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2354,7 +3321,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
; GFX1064GISEL-NEXT: v_readlane_b32 s9, v2, s8
; GFX1064GISEL-NEXT: v_readlane_b32 s10, v3, s8
@@ -2362,7 +3329,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064GISEL-NEXT: s_sub_u32 s4, s4, s9
; GFX1064GISEL-NEXT: s_subb_u32 s5, s5, s10
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2374,7 +3341,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s7
@@ -2382,7 +3349,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032DAGISEL-NEXT: s_sub_u32 s4, s4, s8
; GFX1032DAGISEL-NEXT: s_subb_u32 s5, s5, s9
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2394,7 +3361,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s7
@@ -2402,7 +3369,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032GISEL-NEXT: s_sub_u32 s4, s4, s8
; GFX1032GISEL-NEXT: s_subb_u32 s5, s5, s9
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2414,7 +3381,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s4, s[2:3]
; GFX1164DAGISEL-NEXT: v_readlane_b32 s5, v2, s4
@@ -2423,7 +3390,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164DAGISEL-NEXT: s_sub_u32 s0, s0, s5
; GFX1164DAGISEL-NEXT: s_subb_u32 s1, s1, s6
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -2435,7 +3402,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s4, s[2:3]
; GFX1164GISEL-NEXT: v_readlane_b32 s5, v2, s4
@@ -2444,7 +3411,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164GISEL-NEXT: s_sub_u32 s0, s0, s5
; GFX1164GISEL-NEXT: s_subb_u32 s1, s1, s6
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -2456,7 +3423,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -2465,7 +3432,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132DAGISEL-NEXT: s_sub_u32 s0, s0, s4
; GFX1132DAGISEL-NEXT: s_subb_u32 s1, s1, s5
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -2476,7 +3443,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -2485,7 +3452,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132GISEL-NEXT: s_sub_u32 s0, s0, s4
; GFX1132GISEL-NEXT: s_subb_u32 s1, s1, s5
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -2500,7 +3467,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX12DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -2510,7 +3477,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX12DAGISEL-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX12DAGISEL-NEXT: ; %bb.2:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
@@ -2531,7 +3498,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -2544,13 +3511,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8DAGISEL-NEXT: s_mul_i32 s3, s3, s7
; GFX8DAGISEL-NEXT: s_add_u32 s2, s2, s3
; GFX8DAGISEL-NEXT: s_add_u32 s7, s2, s10
-; GFX8DAGISEL-NEXT: .LBB8_2: ; %Flow
+; GFX8DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -2564,7 +3531,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8DAGISEL-NEXT: s_add_u32 s7, s4, s8
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s7
-; GFX8DAGISEL-NEXT: .LBB8_4: ; %endif
+; GFX8DAGISEL-NEXT: .LBB9_4: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -2578,7 +3545,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -2591,10 +3558,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8GISEL-NEXT: s_mul_i32 s3, s3, s7
; GFX8GISEL-NEXT: s_add_u32 s2, s2, s3
; GFX8GISEL-NEXT: s_add_u32 s7, s2, s10
-; GFX8GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB9_2: ; %Flow
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
@@ -2608,7 +3575,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8GISEL-NEXT: s_mul_i32 s5, s5, s7
; GFX8GISEL-NEXT: s_add_u32 s4, s4, s5
; GFX8GISEL-NEXT: s_add_u32 s7, s4, s8
-; GFX8GISEL-NEXT: .LBB8_4: ; %endif
+; GFX8GISEL-NEXT: .LBB9_4: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -2625,7 +3592,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -2638,13 +3605,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9DAGISEL-NEXT: s_mul_i32 s3, s3, s5
; GFX9DAGISEL-NEXT: s_add_u32 s2, s2, s3
; GFX9DAGISEL-NEXT: s_add_u32 s5, s2, s10
-; GFX9DAGISEL-NEXT: .LBB8_2: ; %Flow
+; GFX9DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5
; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -2658,7 +3625,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9DAGISEL-NEXT: s_add_u32 s5, s5, s8
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX9DAGISEL-NEXT: .LBB8_4: ; %endif
+; GFX9DAGISEL-NEXT: .LBB9_4: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -2671,7 +3638,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -2684,10 +3651,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9GISEL-NEXT: s_mul_i32 s3, s3, s7
; GFX9GISEL-NEXT: s_add_u32 s2, s2, s3
; GFX9GISEL-NEXT: s_add_u32 s7, s2, s10
-; GFX9GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB9_2: ; %Flow
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
@@ -2701,7 +3668,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9GISEL-NEXT: s_mul_i32 s5, s8, s5
; GFX9GISEL-NEXT: s_add_u32 s4, s7, s4
; GFX9GISEL-NEXT: s_add_u32 s7, s4, s5
-; GFX9GISEL-NEXT: .LBB8_4: ; %endif
+; GFX9GISEL-NEXT: .LBB9_4: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2718,7 +3685,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr8_sgpr9
; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1064DAGISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
@@ -2731,7 +3698,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064DAGISEL-NEXT: s_add_u32 s3, s10, s3
; GFX1064DAGISEL-NEXT: s_mul_i32 s8, s2, s8
; GFX1064DAGISEL-NEXT: s_add_u32 s9, s3, s9
-; GFX1064DAGISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1064DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[4:5]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s8
@@ -2763,7 +3730,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -2776,10 +3743,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064GISEL-NEXT: s_add_u32 s3, s10, s3
; GFX1064GISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX1064GISEL-NEXT: s_add_u32 s7, s3, s7
-; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
@@ -2793,7 +3760,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064GISEL-NEXT: s_add_u32 s7, s8, s7
; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s4
; GFX1064GISEL-NEXT: s_add_u32 s7, s7, s5
-; GFX1064GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1064GISEL-NEXT: .LBB9_4: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2810,7 +3777,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
@@ -2823,7 +3790,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032DAGISEL-NEXT: s_add_u32 s3, s9, s3
; GFX1032DAGISEL-NEXT: s_mul_i32 s4, s2, s4
; GFX1032DAGISEL-NEXT: s_add_u32 s5, s3, s5
-; GFX1032DAGISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1032DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4
@@ -2855,7 +3822,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s6, s6
@@ -2868,10 +3835,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032GISEL-NEXT: s_add_u32 s3, s9, s3
; GFX1032GISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX1032GISEL-NEXT: s_add_u32 s7, s3, s7
-; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
@@ -2885,7 +3852,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032GISEL-NEXT: s_add_u32 s5, s5, s7
; GFX1032GISEL-NEXT: s_mul_i32 s6, s6, s3
; GFX1032GISEL-NEXT: s_add_u32 s7, s5, s4
-; GFX1032GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1032GISEL-NEXT: .LBB9_4: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2904,7 +3871,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1164DAGISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1164DAGISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2919,7 +3886,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164DAGISEL-NEXT: s_add_u32 s3, s10, s3
; GFX1164DAGISEL-NEXT: s_mul_i32 s8, s2, s8
; GFX1164DAGISEL-NEXT: s_add_u32 s9, s3, s9
-; GFX1164DAGISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1164DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7]
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s8
@@ -2955,7 +3922,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2970,10 +3937,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164GISEL-NEXT: s_add_u32 s3, s10, s3
; GFX1164GISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX1164GISEL-NEXT: s_add_u32 s7, s3, s7
-; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec
@@ -2989,7 +3956,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164GISEL-NEXT: s_add_u32 s5, s8, s5
; GFX1164GISEL-NEXT: s_mul_i32 s6, s4, s6
; GFX1164GISEL-NEXT: s_add_u32 s7, s5, s7
-; GFX1164GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1164GISEL-NEXT: .LBB9_4: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -3008,7 +3975,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1132DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1132DAGISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3023,7 +3990,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132DAGISEL-NEXT: s_add_u32 s3, s9, s3
; GFX1132DAGISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX1132DAGISEL-NEXT: s_add_u32 s7, s3, s7
-; GFX1132DAGISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1132DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
@@ -3058,7 +4025,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3073,10 +4040,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132GISEL-NEXT: s_add_u32 s3, s9, s3
; GFX1132GISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX1132GISEL-NEXT: s_add_u32 s7, s3, s7
-; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
@@ -3092,7 +4059,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132GISEL-NEXT: s_add_u32 s5, s7, s5
; GFX1132GISEL-NEXT: s_mul_i32 s6, s4, s3
; GFX1132GISEL-NEXT: s_add_u32 s7, s5, s8
-; GFX1132GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1132GISEL-NEXT: .LBB9_4: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
@@ -3110,7 +4077,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX12DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
; GFX12DAGISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3125,7 +4092,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX12DAGISEL-NEXT: s_add_co_u32 s3, s9, s3
; GFX12DAGISEL-NEXT: s_mul_i32 s6, s2, s6
; GFX12DAGISEL-NEXT: s_add_co_u32 s7, s3, s7
-; GFX12DAGISEL-NEXT: .LBB8_2: ; %Flow
+; GFX12DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
More information about the llvm-branch-commits
mailing list