[llvm-branch-commits] [llvm] [AMDGPU] DPP wave reduction for long types - 1 (PR #189224)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sun Mar 29 04:06:19 PDT 2026
https://github.com/easyonaadit created https://github.com/llvm/llvm-project/pull/189224
Supported Ops: `min`, `max`, `umin`, `umax`
>From 8f1f33a5e5099f8d2abad2eaeb7a92dcfbee5f5a Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Thu, 26 Mar 2026 14:48:04 +0530
Subject: [PATCH] [AMDGPU] DPP wave reduction for long types - 1
Supported Ops: `min`, `max`, `umin`, `umax`
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 234 +++-
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll | 1192 +++++++++++++++--
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll | 1192 +++++++++++++++--
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 1152 ++++++++++++++--
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 1152 ++++++++++++++--
5 files changed, 4444 insertions(+), 478 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 465bf3981cd5a..dfec8aaf56767 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5673,7 +5673,7 @@ static uint64_t getIdentityValueForWaveReduction(unsigned Opc) {
}
}
-static std::tuple<bool, bool> ClassifyWaveReductionOp(unsigned Opc) {
+static std::tuple<bool, bool, bool> ClassifyWaveReductionOp(unsigned Opc) {
bool is32BitOpc = Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
@@ -5688,7 +5688,8 @@ static std::tuple<bool, bool> ClassifyWaveReductionOp(unsigned Opc) {
Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
- return {is32BitOpc, isFPOp};
+ bool NeedsMovDPP = !is32BitOpc;
+ return {is32BitOpc, isFPOp, NeedsMovDPP};
}
static std::tuple<unsigned, unsigned>
@@ -5731,12 +5732,17 @@ getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST) {
case AMDGPU::V_MAX_F32_e64:
DPPOpc = AMDGPU::V_MAX_F32_dpp;
break;
+ case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
+ case AMDGPU::V_CMP_LT_I64_e64: // min.i64
+ case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
+ case AMDGPU::V_CMP_GT_I64_e64: // max.i64
+ DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;
+ break;
default:
llvm_unreachable("unhandled lane op");
}
- bool isFPOp = std::get<1>(ClassifyWaveReductionOp(Opc));
unsigned ClampOpc = Opc;
- if (!isFPOp) {
+ if (!ST.getInstrInfo()->isVALU(Opc)) {
if (Opc == AMDGPU::S_SUB_I32)
ClampOpc = AMDGPU::S_ADD_I32;
ClampOpc = ST.getInstrInfo()->getVALUOp(ClampOpc);
@@ -6013,7 +6019,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
} else {
MachineBasicBlock::iterator I = BB.end();
Register SrcReg = MI.getOperand(1).getReg();
- auto [is32BitOpc, isFPOp] = ClassifyWaveReductionOp(Opc);
+ auto [is32BitOpc, isFPOp, NeedsMovDPP] = ClassifyWaveReductionOp(Opc);
// Create virtual registers required for lowering.
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
@@ -6246,7 +6252,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
RetBB = ComputeEnd;
} else {
assert(ST.hasDPP() && "Sub Target does not support DPP Operations");
-
+ MachineBasicBlock *CurrBB = &BB;
Register SrcWithIdentity = MRI.createVirtualRegister(SrcRegClass);
Register IdentityVGPR = MRI.createVirtualRegister(SrcRegClass);
Register IdentitySGPR = MRI.createVirtualRegister(DstRegClass);
@@ -6260,13 +6266,19 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
Register RowBcast31 = MRI.createVirtualRegister(SrcRegClass);
Register UndefExec = MRI.createVirtualRegister(WaveMaskRegClass);
Register FinalDPPResult;
- BuildMI(BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), UndefExec);
+ MachineInstr *SrcWithIdentityInstr;
+ MachineInstr *LastBcastInstr;
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), UndefExec);
uint64_t IdentityValue = getIdentityValueForWaveReduction(Opc);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), IdentitySGPR)
+ BuildMI(*CurrBB, MI, DL,
+ TII->get(is32BitOpc ? AMDGPU::S_MOV_B32
+ : AMDGPU::S_MOV_B64_IMM_PSEUDO),
+ IdentitySGPR)
.addImm(IdentityValue);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::COPY), IdentityVGPR)
- .addReg(IdentitySGPR);
+ auto IdentityCopyInstr =
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::COPY), IdentityVGPR)
+ .addReg(IdentitySGPR);
auto [DPPOpc, ClampOpc] = getDPPOpcForWaveReduction(Opc, ST);
auto BuildSetInactiveInstr = [&](Register Dst, Register Src0,
Register Src1) {
@@ -6281,21 +6293,22 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
auto BuildDPPMachineInstr = [&](Register Dst, Register Src,
unsigned DPPCtrl) {
auto DPPInstr =
- BuildMI(BB, MI, DL, TII->get(DPPOpc), Dst).addReg(Src); // old
+ BuildMI(*CurrBB, MI, DL, TII->get(DPPOpc), Dst).addReg(Src); // old
if (isFPOp)
DPPInstr.addImm(SISrcMods::NONE); // src0 modifier
DPPInstr.addReg(Src); // src0
if (isFPOp)
DPPInstr.addImm(SISrcMods::NONE); // src1 modifier
+ if (!NeedsMovDPP)
+ DPPInstr.addReg(Src); // src1
DPPInstr
- .addReg(Src) // src1
.addImm(DPPCtrl) // dpp-ctrl
.addImm(0xf) // row-mask
.addImm(0xf) // bank-mask
.addImm(0); // bound-control
};
auto BuildClampInstr = [&](Register Dst, Register Src0, Register Src1) {
- auto ClampInstr = BuildMI(BB, MI, DL, TII->get(ClampOpc), Dst);
+ auto ClampInstr = BuildMI(*CurrBB, MI, DL, TII->get(ClampOpc), Dst);
if (isFPOp)
ClampInstr.addImm(SISrcMods::NONE); // src0 mod
ClampInstr.addReg(Src0); // src0
@@ -6306,95 +6319,201 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
ClampInstr.addImm(0); // clamp
if (isFPOp)
ClampInstr.addImm(0); // omod
+ LastBcastInstr = ClampInstr;
+ };
+ auto BuildPostDPPInstr = [&](Register Src0, Register Src1) {
+ Register CmpMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register MinMaxResultReg = MRI.createVirtualRegister(SrcRegClass);
+ BuildMI(*CurrBB, MI, DL, TII->get(Opc), CmpMaskReg)
+ .addReg(Src0) // src0
+ .addReg(Src1); // src1
+ LastBcastInstr =
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B64_PSEUDO),
+ MinMaxResultReg)
+ .addReg(Src1) // src0
+ .addReg(Src0) // src1
+ .addReg(CmpMaskReg); // src2
+ CurrBB = Expand64BitV_CND_MASK(*LastBcastInstr, CurrBB);
+ return MinMaxResultReg;
};
// Set inactive lanes to the identity value.
- MachineInstr *SrcWithIdentityInstr =
- BuildSetInactiveInstr(SrcWithIdentity, SrcReg, IdentityVGPR);
-
+ if (is32BitOpc) {
+ SrcWithIdentityInstr =
+ BuildSetInactiveInstr(SrcWithIdentity, SrcReg, IdentityVGPR);
+ } else {
+ Register SrcWithIdentitylo =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register SrcWithIdentityhi =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ auto [Reg0Sub0, Reg0Sub1] =
+ ExtractSubRegs(MI, IdentityCopyInstr->getOperand(0), SrcRegClass);
+ auto [SrcReg0Sub0, SrcReg0Sub1] =
+ ExtractSubRegs(MI, MI.getOperand(1), SrcRegClass);
+ MachineInstr *SetInactiveLoInstr = BuildSetInactiveInstr(
+ SrcWithIdentitylo, SrcReg0Sub0.getReg(), Reg0Sub0.getReg());
+ MachineInstr *SetInactiveHiInstr = BuildSetInactiveInstr(
+ SrcWithIdentityhi, SrcReg0Sub1.getReg(), Reg0Sub1.getReg());
+ SrcWithIdentityInstr =
+ BuildRegSequence(*CurrBB, MI, SrcWithIdentity,
+ SetInactiveLoInstr->getOperand(0).getReg(),
+ SetInactiveHiInstr->getOperand(0).getReg());
+ }
// DPP reduction
- BuildDPPMachineInstr(DPPRowShr1, SrcWithIdentity,
+ Register SrcWithIdentityReg =
+ SrcWithIdentityInstr->getOperand(0).getReg();
+ BuildDPPMachineInstr(DPPRowShr1, SrcWithIdentityReg,
AMDGPU::DPP::ROW_SHR_FIRST);
+ if (NeedsMovDPP)
+ DPPRowShr1 = BuildPostDPPInstr(SrcWithIdentityReg, DPPRowShr1);
BuildDPPMachineInstr(DPPRowShr2, DPPRowShr1,
(AMDGPU::DPP::ROW_SHR_FIRST + 1));
+ if (NeedsMovDPP)
+ DPPRowShr2 = BuildPostDPPInstr(DPPRowShr1, DPPRowShr2);
BuildDPPMachineInstr(DPPRowShr4, DPPRowShr2,
(AMDGPU::DPP::ROW_SHR_FIRST + 3));
+ if (NeedsMovDPP)
+ DPPRowShr4 = BuildPostDPPInstr(DPPRowShr2, DPPRowShr4);
BuildDPPMachineInstr(DPPRowShr8, DPPRowShr4,
(AMDGPU::DPP::ROW_SHR_FIRST + 7));
+ if (NeedsMovDPP)
+ DPPRowShr8 = BuildPostDPPInstr(DPPRowShr4, DPPRowShr8);
if (ST.hasDPPBroadcasts()) {
BuildDPPMachineInstr(RowBcast15, DPPRowShr8, AMDGPU::DPP::BCAST15);
+ if (NeedsMovDPP)
+ RowBcast15 = BuildPostDPPInstr(DPPRowShr8, RowBcast15);
} else {
// magic constant: 0x1E0
// To Set BIT_MODE : bit 15 = 0
// XOR mask : bit [14:10] = 0
// OR mask : bit [9:5] = 15
// AND mask : bit [4:0] = 0
- Register SwizzledValue =
- MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32), SwizzledValue)
- .addReg(DPPRowShr8) // addr
- .addImm(0x1E0) // swizzle offset (i16)
- .addImm(0x0); // gds (i1)
- BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue);
+ if (is32BitOpc) {
+ Register SwizzledValue =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
+ SwizzledValue)
+ .addReg(DPPRowShr8) // addr
+ .addImm(0x1E0) // swizzle offset (i16)
+ .addImm(0x0); // gds (i1)
+ BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue);
+ } else {
+ Register SwizzledValuelo =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register SwizzledValuehi =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register SwizzledValue64 = MRI.createVirtualRegister(SrcRegClass);
+ MachineOperand DPPRowShr8Op =
+ MachineOperand::CreateReg(DPPRowShr8, /*isDef=*/false);
+ auto [Op1L, Op1H] = ExtractSubRegs(MI, DPPRowShr8Op, SrcRegClass);
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
+ SwizzledValuelo)
+ .add(Op1L) // addr
+ .addImm(0x1E0) // swizzle offset (i16)
+ .addImm(0x0); // gds (i1)
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
+ SwizzledValuehi)
+ .add(Op1H) // addr
+ .addImm(0x1E0) // swizzle offset (i16)
+ .addImm(0x0); // gds (i1)
+ BuildRegSequence(*CurrBB, MI, SwizzledValue64, SwizzledValuelo,
+ SwizzledValuehi);
+ if (NeedsMovDPP)
+ RowBcast15 = BuildPostDPPInstr(DPPRowShr8, SwizzledValue64);
+ else
+ BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue64);
+ }
}
FinalDPPResult = RowBcast15;
if (!IsWave32) {
if (ST.hasDPPBroadcasts()) {
BuildDPPMachineInstr(RowBcast31, RowBcast15, AMDGPU::DPP::BCAST31);
+ if (NeedsMovDPP)
+ RowBcast31 = BuildPostDPPInstr(RowBcast15, RowBcast31);
} else {
Register ShiftedThreadID =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register PermuteByteOffset =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- Register PermutedValue =
- MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- Register Lane32Offset = MRI.createVirtualRegister(DstRegClass);
- Register WordSizeConst = MRI.createVirtualRegister(DstRegClass);
+ Register PermutedValue = MRI.createVirtualRegister(SrcRegClass);
+ Register Lane32Offset =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register WordSizeConst =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register ThreadIDRegLo =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register ThreadIDReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
// Get the thread ID.
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
ThreadIDRegLo)
.addImm(-1)
.addImm(0);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
ThreadIDReg)
.addImm(-1)
.addReg(ThreadIDRegLo);
// shift each lane over by 32 positions, so value in 31st lane is
// present in 63rd lane.
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), Lane32Offset)
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), Lane32Offset)
.addImm(0x20);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), ShiftedThreadID)
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64),
+ ShiftedThreadID)
.addReg(ThreadIDReg)
.addReg(Lane32Offset)
.addImm(0); // clamp
// multiply by reg size.
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), WordSizeConst)
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), WordSizeConst)
.addImm(0x4);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64),
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64),
PermuteByteOffset)
.addReg(WordSizeConst)
.addReg(ShiftedThreadID);
// Permute the lanes
- BuildMI(BB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32), PermutedValue)
- .addReg(PermuteByteOffset) // addr
- .addReg(RowBcast15) // data
- .addImm(0); // offset
- BuildClampInstr(RowBcast31, RowBcast15, PermutedValue);
+ if (is32BitOpc) {
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
+ PermutedValue)
+ .addReg(PermuteByteOffset) // addr
+ .addReg(RowBcast15) // data
+ .addImm(0); // offset
+ } else {
+ Register PermutedValuelo =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register PermutedValuehi =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineOperand RowBcast15Op =
+ MachineOperand::CreateReg(RowBcast15, /*isDef=*/false);
+ auto [RowBcast15Lo, RowBcast15Hi] =
+ ExtractSubRegs(MI, RowBcast15Op, SrcRegClass);
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
+ PermutedValuelo)
+ .addReg(PermuteByteOffset) // addr
+                .add(RowBcast15Lo)         // data
+                .addImm(0x0);              // offset
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
+ PermutedValuehi)
+ .addReg(PermuteByteOffset) // addr
+                .add(RowBcast15Hi)         // data
+                .addImm(0x0);              // offset
+ BuildRegSequence(*CurrBB, MI, PermutedValue, PermutedValuelo,
+ PermutedValuehi);
+ }
+ if (NeedsMovDPP)
+ RowBcast31 = BuildPostDPPInstr(RowBcast15, PermutedValue);
+ else
+ BuildClampInstr(RowBcast31, RowBcast15, PermutedValue);
}
FinalDPPResult = RowBcast31;
}
if (Opc == AMDGPU::V_SUB_F32_e64) {
Register NegatedValVGPR =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_SUB_F32_e64), NegatedValVGPR)
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_SUB_F32_e64),
+ NegatedValVGPR)
.addImm(SISrcMods::NONE) // src0 mods
.addReg(IdentityVGPR) // src0
.addImm(SISrcMods::NONE) // src1 mods
@@ -6404,18 +6523,41 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
FinalDPPResult = NegatedValVGPR;
}
// The final reduced value is in the last lane.
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), ReducedValSGPR)
- .addReg(FinalDPPResult)
- .addImm(ST.getWavefrontSize() - 1);
+ if (is32BitOpc) {
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
+ ReducedValSGPR)
+ .addReg(FinalDPPResult)
+ .addImm(ST.getWavefrontSize() - 1);
+ } else {
+ Register LaneValueLoReg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register LaneValueHiReg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+ MachineOperand FinalDPPResultOperand =
+ MachineOperand::CreateReg(FinalDPPResult, /*isDef=*/false);
+ auto [Op1L, Op1H] = ExtractSubRegs(MI, FinalDPPResultOperand, SrcRC);
+ // lane value input should be in an sgpr
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
+ LaneValueLoReg)
+ .add(Op1L)
+ .addImm(ST.getWavefrontSize() - 1);
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
+ LaneValueHiReg)
+ .add(Op1H)
+ .addImm(ST.getWavefrontSize() - 1);
+ BuildRegSequence(*CurrBB, MI, ReducedValSGPR, LaneValueLoReg,
+ LaneValueHiReg);
+ }
if (Opc == AMDGPU::S_SUB_I32)
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
.addImm(0)
.addReg(ReducedValSGPR);
// Mark the final result as a whole-wave-mode calculation.
- BuildMI(BB, MI, DL, TII->get(AMDGPU::STRICT_WWM), DstReg)
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::STRICT_WWM), DstReg)
.addReg(Opc == AMDGPU::S_SUB_I32 ? NegatedReducedVal
: ReducedValSGPR);
- RetBB = &BB;
+ RetBB = CurrBB;
}
}
MI.eraseFromParent();
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
index 502d58f66bd31..8f49864792794 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
@@ -816,6 +816,982 @@ entry:
ret void
}
+define void @divergent_value_dpp_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v8, v6
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[7:8]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v8, v6, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_dpp_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: v_mov_b32_e32 v8, v6
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[7:8]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v6, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v8, v6
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[7:8]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v8, v6, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX9DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_dpp_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: v_mov_b32_e32 v8, v6
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[7:8]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v6, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX9GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s[4:5]
+; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1064DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1064DAGISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: s_clause 0x6 ; 28-byte Folded Reload
+; GFX1064DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1064DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
+; GFX1064DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
+; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s[4:5]
+; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1064GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1064GISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX1064GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX1064GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: s_clause 0x6 ; 28-byte Folded Reload
+; GFX1064GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1064GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064GISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
+; GFX1064GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
+; GFX1064GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s6
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s6
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v5, 31
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s6
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT: s_clause 0x5 ; 24-byte Folded Reload
+; GFX1032DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1032DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; GFX1032DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20
+; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s6
+; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s6
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v5, 31
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s6
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT: s_clause 0x5 ; 24-byte Folded Reload
+; GFX1032GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1032GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; GFX1032GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20
+; GFX1032GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_clause 0x4 ; 28-byte Folded Spill
+; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v8, s32 offset:16
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:20
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:24
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s[0:1]
+; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1164DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1164DAGISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-NEXT: s_clause 0x4 ; 28-byte Folded Reload
+; GFX1164DAGISEL-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1164DAGISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v8, off, s32 offset:16
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:20
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:24
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_clause 0x4 ; 28-byte Folded Spill
+; GFX1164GISEL-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1164GISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v8, s32 offset:16
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v4, s32 offset:20
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:24
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s[0:1]
+; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1164GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1164GISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164GISEL-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164GISEL-NEXT: s_clause 0x4 ; 28-byte Folded Reload
+; GFX1164GISEL-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1164GISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164GISEL-NEXT: scratch_load_b32 v8, off, s32 offset:16
+; GFX1164GISEL-NEXT: scratch_load_b32 v4, off, s32 offset:20
+; GFX1164GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:24
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT: s_clause 0x3 ; 24-byte Folded Spill
+; GFX1132DAGISEL-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1132DAGISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT: s_clause 0x3 ; 24-byte Folded Reload
+; GFX1132DAGISEL-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1132DAGISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; GFX1132DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT: s_clause 0x3 ; 24-byte Folded Spill
+; GFX1132GISEL-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1132GISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132GISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; GFX1132GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s2
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132GISEL-NEXT: v_readlane_b32 s1, v5, 31
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT: s_clause 0x3 ; 24-byte Folded Reload
+; GFX1132GISEL-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1132GISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132GISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; GFX1132GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %result = call i64 @llvm.amdgcn.wave.reduce.max.i64(i64 %in, i32 2)
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
; GFX8DAGISEL-LABEL: default_stratergy:
; GFX8DAGISEL: ; %bb.0: ; %entry
@@ -1168,20 +2144,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8DAGISEL-NEXT: s_brev_b32 s6, 1
-; GFX8DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX8DAGISEL-NEXT: s_max_i32 s6, s6, s8
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX8DAGISEL-NEXT: ; %bb.5:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX8DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -1196,26 +2172,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b32 s6, s2
-; GFX8GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB6_2: ; %Flow
; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8GISEL-NEXT: s_brev_b32 s6, 1
-; GFX8GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX8GISEL-NEXT: s_max_i32 s6, s6, s8
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX8GISEL-NEXT: .LBB5_5: ; %endif
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX8GISEL-NEXT: .LBB6_5: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
@@ -1239,20 +2215,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9DAGISEL-NEXT: s_brev_b32 s6, 1
-; GFX9DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX9DAGISEL-NEXT: s_max_i32 s6, s6, s8
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX9DAGISEL-NEXT: ; %bb.5:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX9DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1266,26 +2242,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b32 s6, s2
-; GFX9GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB6_2: ; %Flow
; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9GISEL-NEXT: s_brev_b32 s6, 1
-; GFX9GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX9GISEL-NEXT: s_max_i32 s6, s6, s8
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX9GISEL-NEXT: .LBB5_5: ; %endif
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX9GISEL-NEXT: .LBB6_5: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -1308,20 +2284,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064DAGISEL-NEXT: s_brev_b32 s6, 1
-; GFX1064DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1064DAGISEL-NEXT: s_max_i32 s6, s6, s8
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1064DAGISEL-NEXT: ; %bb.5:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1064DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1335,26 +2311,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b32 s6, s2
-; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064GISEL-NEXT: s_brev_b32 s6, 1
-; GFX1064GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1064GISEL-NEXT: s_max_i32 s6, s6, s8
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1064GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1064GISEL-NEXT: .LBB6_5: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -1377,20 +2353,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032DAGISEL-NEXT: s_brev_b32 s1, 1
-; GFX1032DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1032DAGISEL-NEXT: s_max_i32 s1, s1, s6
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1032DAGISEL-NEXT: ; %bb.5:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1032DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1404,26 +2380,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b32 s0, s0
-; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032GISEL-NEXT: s_brev_b32 s0, 1
-; GFX1032GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1032GISEL-NEXT: s_max_i32 s0, s0, s6
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1032GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1032GISEL-NEXT: .LBB6_5: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1448,21 +2424,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164DAGISEL-NEXT: s_brev_b32 s6, 1
-; GFX1164DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_max_i32 s6, s6, s8
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1164DAGISEL-NEXT: ; %bb.5:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1164DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1478,27 +2454,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b32 s6, s2
-; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164GISEL-NEXT: s_brev_b32 s6, 1
-; GFX1164GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_max_i32 s6, s6, s8
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1164GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1164GISEL-NEXT: .LBB6_5: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -1523,21 +2499,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132DAGISEL-NEXT: s_brev_b32 s1, 1
-; GFX1132DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_max_i32 s1, s1, s6
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1132DAGISEL-NEXT: ; %bb.5:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1132DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1553,27 +2529,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b32 s0, s0
-; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132GISEL-NEXT: s_brev_b32 s0, 1
-; GFX1132GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_max_i32 s0, s0, s6
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1132GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1132GISEL-NEXT: .LBB6_5: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -1712,7 +2688,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX8DAGISEL-NEXT: s_brev_b32 s5, 1
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1723,7 +2699,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX8DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1737,7 +2713,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8GISEL-NEXT: s_mov_b32 s4, 0
; GFX8GISEL-NEXT: s_brev_b32 s5, 1
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1748,7 +2724,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX8GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1762,7 +2738,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX9DAGISEL-NEXT: s_brev_b32 s5, 1
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1773,7 +2749,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX9DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1787,7 +2763,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9GISEL-NEXT: s_mov_b32 s4, 0
; GFX9GISEL-NEXT: s_brev_b32 s5, 1
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1798,7 +2774,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX9GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1812,7 +2788,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1064DAGISEL-NEXT: s_brev_b32 s5, 1
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1823,7 +2799,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX1064DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1836,7 +2812,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
; GFX1064GISEL-NEXT: s_brev_b32 s5, 1
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1847,7 +2823,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX1064GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1860,7 +2836,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1032DAGISEL-NEXT: s_brev_b32 s5, 1
; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1871,7 +2847,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7
; GFX1032DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1884,7 +2860,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032GISEL-NEXT: s_mov_b32 s4, 0
; GFX1032GISEL-NEXT: s_brev_b32 s5, 1
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1895,7 +2871,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7
; GFX1032GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1908,7 +2884,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX1164DAGISEL-NEXT: s_brev_b32 s1, 1
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s0
@@ -1920,7 +2896,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s8
; GFX1164DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -1933,7 +2909,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164GISEL-NEXT: s_mov_b32 s0, 0
; GFX1164GISEL-NEXT: s_brev_b32 s1, 1
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, s0
@@ -1945,7 +2921,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s8
; GFX1164GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -1958,7 +2934,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX1132DAGISEL-NEXT: s_brev_b32 s1, 1
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -1969,7 +2945,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -1981,7 +2957,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
; GFX1132GISEL-NEXT: s_brev_b32 s1, 1
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -1992,7 +2968,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -2033,19 +3009,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX8GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB9_2: ; %Flow
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GFX8GISEL-NEXT: .LBB8_4: ; %endif
+; GFX8GISEL-NEXT: .LBB9_4: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -2082,19 +3058,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB9_2: ; %Flow
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[6:7]
-; GFX9GISEL-NEXT: .LBB8_4: ; %endif
+; GFX9GISEL-NEXT: .LBB9_4: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2131,19 +3107,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[6:7]
-; GFX1064GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1064GISEL-NEXT: .LBB9_4: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2180,19 +3156,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[6:7]
-; GFX1032GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1032GISEL-NEXT: .LBB9_4: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2233,19 +3209,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GFX1164GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1164GISEL-NEXT: .LBB9_4: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2284,19 +3260,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GFX1132GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1132GISEL-NEXT: .LBB9_4: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
index a196fd6388bed..23d10af9371b0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
@@ -816,6 +816,982 @@ entry:
ret void
}
+define void @divergent_value_dpp_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: v_bfrev_b32_e32 v4, -2
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v5, -1, v2, s[4:5]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v8, v6
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[5:6], v[7:8]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v8, v6, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_dpp_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: v_bfrev_b32_e32 v4, -2
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v5, -1, v2, s[4:5]
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: v_mov_b32_e32 v8, v6
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[5:6], v[7:8]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v6, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: v_bfrev_b32_e32 v4, -2
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v5, -1, v2, s[4:5]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v8, v6
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[5:6], v[7:8]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v8, v6, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX9DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_dpp_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: v_bfrev_b32_e32 v4, -2
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v5, -1, v2, s[4:5]
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: v_mov_b32_e32 v8, v6
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[5:6], v[7:8]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v6, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX9GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0x7fffffff, v3, s[4:5]
+; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1064DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1064DAGISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: s_clause 0x6 ; 28-byte Folded Reload
+; GFX1064DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1064DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
+; GFX1064DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
+; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v5, 0x7fffffff, v3, s[4:5]
+; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1064GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1064GISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX1064GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX1064GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: s_clause 0x6 ; 28-byte Folded Reload
+; GFX1064GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1064GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064GISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
+; GFX1064GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
+; GFX1064GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s6
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0x7fffffff, v3, s6
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v5, 31
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s6
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT: s_clause 0x5 ; 24-byte Folded Reload
+; GFX1032DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1032DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; GFX1032DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20
+; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s6
+; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v5, 0x7fffffff, v3, s6
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v5, 31
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s6
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT: s_clause 0x5 ; 24-byte Folded Reload
+; GFX1032GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1032GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; GFX1032GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20
+; GFX1032GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_clause 0x4 ; 28-byte Folded Spill
+; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v8, s32 offset:16
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:20
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:24
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s[0:1]
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0x7fffffff, v3, s[0:1]
+; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1164DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1164DAGISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-NEXT: s_clause 0x4 ; 28-byte Folded Reload
+; GFX1164DAGISEL-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1164DAGISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v8, off, s32 offset:16
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:20
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:24
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_clause 0x4 ; 28-byte Folded Spill
+; GFX1164GISEL-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1164GISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v8, s32 offset:16
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v4, s32 offset:20
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:24
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s[0:1]
+; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v5, 0x7fffffff, v3, s[0:1]
+; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1164GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1164GISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164GISEL-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164GISEL-NEXT: s_clause 0x4 ; 28-byte Folded Reload
+; GFX1164GISEL-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1164GISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164GISEL-NEXT: scratch_load_b32 v8, off, s32 offset:16
+; GFX1164GISEL-NEXT: scratch_load_b32 v4, off, s32 offset:20
+; GFX1164GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:24
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT: s_clause 0x3 ; 24-byte Folded Spill
+; GFX1132DAGISEL-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1132DAGISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s2
+; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0x7fffffff, v3, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT: s_clause 0x3 ; 24-byte Folded Reload
+; GFX1132DAGISEL-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1132DAGISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; GFX1132DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT: s_clause 0x3 ; 24-byte Folded Spill
+; GFX1132GISEL-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1132GISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132GISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; GFX1132GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s2
+; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v5, 0x7fffffff, v3, s2
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132GISEL-NEXT: v_readlane_b32 s1, v5, 31
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT: s_clause 0x3 ; 24-byte Folded Reload
+; GFX1132GISEL-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1132GISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132GISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; GFX1132GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %result = call i64 @llvm.amdgcn.wave.reduce.min.i64(i64 %in, i32 2)
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
; GFX8DAGISEL-LABEL: default_stratergy:
; GFX8DAGISEL: ; %bb.0: ; %entry
@@ -1168,20 +2144,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8DAGISEL-NEXT: s_brev_b32 s6, -2
-; GFX8DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX8DAGISEL-NEXT: s_min_i32 s6, s6, s8
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX8DAGISEL-NEXT: ; %bb.5:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX8DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -1196,26 +2172,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b32 s6, s2
-; GFX8GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB6_2: ; %Flow
; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8GISEL-NEXT: s_brev_b32 s6, -2
-; GFX8GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX8GISEL-NEXT: s_min_i32 s6, s6, s8
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX8GISEL-NEXT: .LBB5_5: ; %endif
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX8GISEL-NEXT: .LBB6_5: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
@@ -1239,20 +2215,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9DAGISEL-NEXT: s_brev_b32 s6, -2
-; GFX9DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX9DAGISEL-NEXT: s_min_i32 s6, s6, s8
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX9DAGISEL-NEXT: ; %bb.5:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX9DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1266,26 +2242,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b32 s6, s2
-; GFX9GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB6_2: ; %Flow
; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9GISEL-NEXT: s_brev_b32 s6, -2
-; GFX9GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX9GISEL-NEXT: s_min_i32 s6, s6, s8
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX9GISEL-NEXT: .LBB5_5: ; %endif
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX9GISEL-NEXT: .LBB6_5: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -1308,20 +2284,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064DAGISEL-NEXT: s_brev_b32 s6, -2
-; GFX1064DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1064DAGISEL-NEXT: s_min_i32 s6, s6, s8
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1064DAGISEL-NEXT: ; %bb.5:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1064DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1335,26 +2311,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b32 s6, s2
-; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064GISEL-NEXT: s_brev_b32 s6, -2
-; GFX1064GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1064GISEL-NEXT: s_min_i32 s6, s6, s8
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1064GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1064GISEL-NEXT: .LBB6_5: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -1377,20 +2353,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032DAGISEL-NEXT: s_brev_b32 s1, -2
-; GFX1032DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1032DAGISEL-NEXT: s_min_i32 s1, s1, s6
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1032DAGISEL-NEXT: ; %bb.5:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1032DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1404,26 +2380,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b32 s0, s0
-; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032GISEL-NEXT: s_brev_b32 s0, -2
-; GFX1032GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1032GISEL-NEXT: s_min_i32 s0, s0, s6
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1032GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1032GISEL-NEXT: .LBB6_5: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1448,21 +2424,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164DAGISEL-NEXT: s_brev_b32 s6, -2
-; GFX1164DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_min_i32 s6, s6, s8
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1164DAGISEL-NEXT: ; %bb.5:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1164DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1478,27 +2454,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b32 s6, s2
-; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164GISEL-NEXT: s_brev_b32 s6, -2
-; GFX1164GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_min_i32 s6, s6, s8
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1164GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1164GISEL-NEXT: .LBB6_5: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -1523,21 +2499,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132DAGISEL-NEXT: s_brev_b32 s1, -2
-; GFX1132DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_min_i32 s1, s1, s6
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1132DAGISEL-NEXT: ; %bb.5:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1132DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1553,27 +2529,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b32 s0, s0
-; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132GISEL-NEXT: s_brev_b32 s0, -2
-; GFX1132GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_min_i32 s0, s0, s6
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1132GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1132GISEL-NEXT: .LBB6_5: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -1712,7 +2688,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8DAGISEL-NEXT: s_mov_b32 s4, -1
; GFX8DAGISEL-NEXT: s_brev_b32 s5, -2
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1723,7 +2699,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX8DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1737,7 +2713,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8GISEL-NEXT: s_mov_b32 s4, -1
; GFX8GISEL-NEXT: s_brev_b32 s5, -2
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1748,7 +2724,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX8GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1762,7 +2738,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9DAGISEL-NEXT: s_mov_b32 s4, -1
; GFX9DAGISEL-NEXT: s_brev_b32 s5, -2
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1773,7 +2749,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX9DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1787,7 +2763,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9GISEL-NEXT: s_mov_b32 s4, -1
; GFX9GISEL-NEXT: s_brev_b32 s5, -2
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1798,7 +2774,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX9GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1812,7 +2788,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064DAGISEL-NEXT: s_mov_b32 s4, -1
; GFX1064DAGISEL-NEXT: s_brev_b32 s5, -2
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1823,7 +2799,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX1064DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1836,7 +2812,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064GISEL-NEXT: s_mov_b32 s4, -1
; GFX1064GISEL-NEXT: s_brev_b32 s5, -2
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1847,7 +2823,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX1064GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1860,7 +2836,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, -1
; GFX1032DAGISEL-NEXT: s_brev_b32 s5, -2
; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1871,7 +2847,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7
; GFX1032DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1884,7 +2860,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032GISEL-NEXT: s_mov_b32 s4, -1
; GFX1032GISEL-NEXT: s_brev_b32 s5, -2
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1895,7 +2871,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7
; GFX1032GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1908,7 +2884,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164DAGISEL-NEXT: s_mov_b32 s0, -1
; GFX1164DAGISEL-NEXT: s_brev_b32 s1, -2
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s0
@@ -1920,7 +2896,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s8
; GFX1164DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -1933,7 +2909,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164GISEL-NEXT: s_mov_b32 s0, -1
; GFX1164GISEL-NEXT: s_brev_b32 s1, -2
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, s0
@@ -1945,7 +2921,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s8
; GFX1164GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -1958,7 +2934,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132DAGISEL-NEXT: s_mov_b32 s0, -1
; GFX1132DAGISEL-NEXT: s_brev_b32 s1, -2
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -1969,7 +2945,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -1981,7 +2957,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132GISEL-NEXT: s_mov_b32 s0, -1
; GFX1132GISEL-NEXT: s_brev_b32 s1, -2
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -1992,7 +2968,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -2033,19 +3009,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX8GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB9_2: ; %Flow
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GFX8GISEL-NEXT: .LBB8_4: ; %endif
+; GFX8GISEL-NEXT: .LBB9_4: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -2082,19 +3058,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB9_2: ; %Flow
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[6:7]
-; GFX9GISEL-NEXT: .LBB8_4: ; %endif
+; GFX9GISEL-NEXT: .LBB9_4: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2131,19 +3107,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[6:7]
-; GFX1064GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1064GISEL-NEXT: .LBB9_4: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2180,19 +3156,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[6:7]
-; GFX1032GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1032GISEL-NEXT: .LBB9_4: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2233,19 +3209,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GFX1164GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1164GISEL-NEXT: .LBB9_4: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2284,19 +3260,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GFX1132GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1132GISEL-NEXT: .LBB9_4: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index f6e3b0ed78b20..238e1430e1cae 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -813,6 +813,942 @@ entry:
ret void
}
+define void @divergent_value_dpp_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_dpp_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX9DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_dpp_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX9GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1064DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1064DAGISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: s_clause 0x6 ; 28-byte Folded Reload
+; GFX1064DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1064DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
+; GFX1064DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
+; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1064GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1064GISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX1064GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX1064GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: s_clause 0x6 ; 28-byte Folded Reload
+; GFX1064GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1064GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064GISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
+; GFX1064GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
+; GFX1064GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s6
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s6
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v5, 31
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s6
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT: s_clause 0x5 ; 24-byte Folded Reload
+; GFX1032DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1032DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; GFX1032DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20
+; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s6
+; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s6
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v5, 31
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s6
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT: s_clause 0x5 ; 24-byte Folded Reload
+; GFX1032GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1032GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; GFX1032GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20
+; GFX1032GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_clause 0x4 ; 28-byte Folded Spill
+; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v8, s32 offset:16
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:20
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:24
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[0:1]
+; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1164DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1164DAGISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-NEXT: s_clause 0x4 ; 28-byte Folded Reload
+; GFX1164DAGISEL-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1164DAGISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v8, off, s32 offset:16
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:20
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:24
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_clause 0x4 ; 28-byte Folded Spill
+; GFX1164GISEL-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1164GISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v8, s32 offset:16
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v4, s32 offset:20
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:24
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[0:1]
+; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1164GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1164GISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164GISEL-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164GISEL-NEXT: s_clause 0x4 ; 28-byte Folded Reload
+; GFX1164GISEL-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1164GISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164GISEL-NEXT: scratch_load_b32 v8, off, s32 offset:16
+; GFX1164GISEL-NEXT: scratch_load_b32 v4, off, s32 offset:20
+; GFX1164GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:24
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT: s_clause 0x3 ; 24-byte Folded Spill
+; GFX1132DAGISEL-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1132DAGISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT: s_clause 0x3 ; 24-byte Folded Reload
+; GFX1132DAGISEL-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1132DAGISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; GFX1132DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT: s_clause 0x3 ; 24-byte Folded Spill
+; GFX1132GISEL-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1132GISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132GISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; GFX1132GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s2
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132GISEL-NEXT: v_readlane_b32 s1, v5, 31
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT: s_clause 0x3 ; 24-byte Folded Reload
+; GFX1132GISEL-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1132GISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132GISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; GFX1132GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %result = call i64 @llvm.amdgcn.wave.reduce.umax.i64(i64 %in, i32 2)
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
; GFX8DAGISEL-LABEL: default_stratergy:
; GFX8DAGISEL: ; %bb.0: ; %entry
@@ -1161,20 +2097,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX8DAGISEL-NEXT: s_max_u32 s6, s6, s8
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX8DAGISEL-NEXT: ; %bb.5:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX8DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -1189,30 +2125,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: ; implicit-def: $sgpr2
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b32 s2, s2
-; GFX8GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB6_2: ; %Flow
; GFX8GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8GISEL-NEXT: s_mov_b32 s6, 0
-; GFX8GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX8GISEL-NEXT: s_max_u32 s6, s6, s8
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX8GISEL-NEXT: ; %bb.5:
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX8GISEL-NEXT: .LBB5_6: ; %endif
+; GFX8GISEL-NEXT: .LBB6_6: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -1235,20 +2171,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX9DAGISEL-NEXT: s_max_u32 s6, s6, s8
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX9DAGISEL-NEXT: ; %bb.5:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX9DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1262,30 +2198,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: ; implicit-def: $sgpr2
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b32 s2, s2
-; GFX9GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB6_2: ; %Flow
; GFX9GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9GISEL-NEXT: s_mov_b32 s6, 0
-; GFX9GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX9GISEL-NEXT: s_max_u32 s6, s6, s8
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX9GISEL-NEXT: ; %bb.5:
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9GISEL-NEXT: .LBB5_6: ; %endif
+; GFX9GISEL-NEXT: .LBB6_6: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1307,20 +2243,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1064DAGISEL-NEXT: s_max_u32 s6, s6, s8
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1064DAGISEL-NEXT: ; %bb.5:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1064DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1334,26 +2270,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b32 s6, s2
-; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1064GISEL-NEXT: s_max_u32 s6, s6, s8
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1064GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1064GISEL-NEXT: .LBB6_5: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -1376,20 +2312,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1032DAGISEL-NEXT: s_max_u32 s1, s1, s6
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1032DAGISEL-NEXT: ; %bb.5:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1032DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1403,30 +2339,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr1
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b32 s1, s1
-; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032GISEL-NEXT: s_mov_b32 s1, 0
-; GFX1032GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1032GISEL-NEXT: s_max_u32 s1, s1, s6
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1032GISEL-NEXT: ; %bb.5:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1032GISEL-NEXT: .LBB5_6: ; %endif
+; GFX1032GISEL-NEXT: .LBB6_6: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1450,21 +2386,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_max_u32 s6, s6, s8
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1164DAGISEL-NEXT: ; %bb.5:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1164DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1480,27 +2416,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b32 s6, s2
-; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_max_u32 s6, s6, s8
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1164GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1164GISEL-NEXT: .LBB6_5: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -1525,21 +2461,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_max_u32 s1, s1, s6
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1132DAGISEL-NEXT: ; %bb.5:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1132DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1555,31 +2491,31 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b32 s1, s1
-; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132GISEL-NEXT: s_mov_b32 s1, 0
-; GFX1132GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_max_u32 s1, s1, s6
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1132GISEL-NEXT: ; %bb.5:
; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1132GISEL-NEXT: .LBB5_6: ; %endif
+; GFX1132GISEL-NEXT: .LBB6_6: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1857,7 +2793,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1868,7 +2804,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX8DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1881,7 +2817,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1892,7 +2828,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX8GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1905,7 +2841,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1916,7 +2852,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX9DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1929,7 +2865,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1940,7 +2876,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX9GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1953,7 +2889,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1964,7 +2900,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX1064DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1976,7 +2912,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1987,7 +2923,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX1064GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1999,7 +2935,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -2010,7 +2946,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7
; GFX1032DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2022,7 +2958,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -2033,7 +2969,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7
; GFX1032GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2045,7 +2981,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s0
@@ -2057,7 +2993,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s8
; GFX1164DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -2069,7 +3005,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, s0
@@ -2081,7 +3017,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s8
; GFX1164GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -2093,7 +3029,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -2104,7 +3040,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -2115,7 +3051,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -2126,7 +3062,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -2167,24 +3103,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB10_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX8GISEL-NEXT: .LBB10_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB11_2: ; %Flow
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB10_4
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[4:5], s[4:5]
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX8GISEL-NEXT: .LBB10_4: ; %endif
+; GFX8GISEL-NEXT: .LBB11_4: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -2219,24 +3155,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB10_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9GISEL-NEXT: .LBB10_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB11_2: ; %Flow
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB10_4
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX9GISEL-NEXT: .LBB10_4: ; %endif
+; GFX9GISEL-NEXT: .LBB11_4: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -2271,19 +3207,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB10_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064GISEL-NEXT: .LBB10_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB11_2: ; %Flow
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB10_4
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[6:7]
-; GFX1064GISEL-NEXT: .LBB10_4: ; %endif
+; GFX1064GISEL-NEXT: .LBB11_4: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2320,24 +3256,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB10_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032GISEL-NEXT: .LBB10_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB11_2: ; %Flow
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB10_4
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1032GISEL-NEXT: .LBB10_4: ; %endif
+; GFX1032GISEL-NEXT: .LBB11_4: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -2376,19 +3312,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB10_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1164GISEL-NEXT: .LBB10_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB11_2: ; %Flow
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB10_4
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GFX1164GISEL-NEXT: .LBB10_4: ; %endif
+; GFX1164GISEL-NEXT: .LBB11_4: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2427,23 +3363,23 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB10_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1132GISEL-NEXT: .LBB10_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB11_2: ; %Flow
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB10_4
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB11_4
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b64 s[4:5], s[4:5]
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1132GISEL-NEXT: .LBB10_4: ; %endif
+; GFX1132GISEL-NEXT: .LBB11_4: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
index 4f01ea425aca9..dda651251a269 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
@@ -813,6 +813,942 @@ entry:
ret void
}
+define void @divergent_value_dpp_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v5, -1, v3, s[4:5]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_dpp_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v5, -1, v3, s[4:5]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX8GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v5, -1, v3, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX9DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_dpp_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v5, -1, v3, s[4:5]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX9GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v5, -1, v3, s[4:5]
+; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1064DAGISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: s_clause 0x6 ; 28-byte Folded Reload
+; GFX1064DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1064DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
+; GFX1064DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
+; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v5, -1, v3, s[4:5]
+; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1064GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1064GISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1064GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX1064GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX1064GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT: s_clause 0x6 ; 28-byte Folded Reload
+; GFX1064GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1064GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064GISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
+; GFX1064GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
+; GFX1064GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s6
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v5, -1, v3, s6
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v5, 31
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s6
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT: s_clause 0x5 ; 24-byte Folded Reload
+; GFX1032DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1032DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; GFX1032DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20
+; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s6
+; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v5, -1, v3, s6
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1032GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX1032GISEL-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v5, 31
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s6
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT: s_clause 0x5 ; 24-byte Folded Reload
+; GFX1032GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
+; GFX1032GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; GFX1032GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20
+; GFX1032GISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_clause 0x4 ; 28-byte Folded Spill
+; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v8, s32 offset:16
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:20
+; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:24
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s[0:1]
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v5, -1, v3, s[0:1]
+; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1164DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1164DAGISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164DAGISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-NEXT: s_clause 0x4 ; 28-byte Folded Reload
+; GFX1164DAGISEL-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1164DAGISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v8, off, s32 offset:16
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:20
+; GFX1164DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:24
+; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_clause 0x4 ; 28-byte Folded Spill
+; GFX1164GISEL-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1164GISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v8, s32 offset:16
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v4, s32 offset:20
+; GFX1164GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:24
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s[0:1]
+; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v5, -1, v3, s[0:1]
+; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1164GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: ds_permute_b32 v6, v8, v4
+; GFX1164GISEL-NEXT: ds_permute_b32 v7, v8, v5
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164GISEL-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164GISEL-NEXT: s_clause 0x4 ; 28-byte Folded Reload
+; GFX1164GISEL-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1164GISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164GISEL-NEXT: scratch_load_b32 v8, off, s32 offset:16
+; GFX1164GISEL-NEXT: scratch_load_b32 v4, off, s32 offset:20
+; GFX1164GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:24
+; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT: s_clause 0x3 ; 24-byte Folded Spill
+; GFX1132DAGISEL-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1132DAGISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s2
+; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v5, -1, v3, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132DAGISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT: s_clause 0x3 ; 24-byte Folded Reload
+; GFX1132DAGISEL-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1132DAGISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; GFX1132DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT: s_clause 0x3 ; 24-byte Folded Spill
+; GFX1132GISEL-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1132GISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132GISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; GFX1132GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s2
+; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v5, -1, v3, s2
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX1132GISEL-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v7, v5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132GISEL-NEXT: v_readlane_b32 s1, v5, 31
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT: s_clause 0x3 ; 24-byte Folded Reload
+; GFX1132GISEL-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1132GISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132GISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; GFX1132GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %result = call i64 @llvm.amdgcn.wave.reduce.umin.i64(i64 %in, i32 2)
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
; GFX8DAGISEL-LABEL: default_stratergy:
; GFX8DAGISEL: ; %bb.0: ; %entry
@@ -1161,20 +2097,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s6, -1
-; GFX8DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX8DAGISEL-NEXT: s_min_u32 s6, s6, s8
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX8DAGISEL-NEXT: ; %bb.5:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX8DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -1189,30 +2125,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: ; implicit-def: $sgpr2
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b32 s2, s2
-; GFX8GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB6_2: ; %Flow
; GFX8GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX8GISEL-NEXT: s_mov_b32 s6, -1
-; GFX8GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX8GISEL-NEXT: s_min_u32 s6, s6, s8
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX8GISEL-NEXT: ; %bb.5:
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX8GISEL-NEXT: .LBB5_6: ; %endif
+; GFX8GISEL-NEXT: .LBB6_6: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -1235,20 +2171,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s6, -1
-; GFX9DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX9DAGISEL-NEXT: s_min_u32 s6, s6, s8
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX9DAGISEL-NEXT: ; %bb.5:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX9DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1262,30 +2198,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: ; implicit-def: $sgpr2
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b32 s2, s2
-; GFX9GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB6_2: ; %Flow
; GFX9GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX9GISEL-NEXT: s_mov_b32 s6, -1
-; GFX9GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX9GISEL-NEXT: s_min_u32 s6, s6, s8
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX9GISEL-NEXT: ; %bb.5:
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9GISEL-NEXT: .LBB5_6: ; %endif
+; GFX9GISEL-NEXT: .LBB6_6: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1307,20 +2243,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s6, -1
-; GFX1064DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1064DAGISEL-NEXT: s_min_u32 s6, s6, s8
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1064DAGISEL-NEXT: ; %bb.5:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1064DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1334,26 +2270,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b32 s6, s2
-; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1064GISEL-NEXT: s_mov_b32 s6, -1
-; GFX1064GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1064GISEL-NEXT: s_min_u32 s6, s6, s8
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1064GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1064GISEL-NEXT: .LBB6_5: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -1376,20 +2312,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032DAGISEL-NEXT: s_mov_b32 s1, -1
-; GFX1032DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1032DAGISEL-NEXT: s_min_u32 s1, s1, s6
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1032DAGISEL-NEXT: ; %bb.5:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1032DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1403,30 +2339,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr1
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b32 s1, s1
-; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032GISEL-NEXT: s_mov_b32 s1, -1
-; GFX1032GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1032GISEL-NEXT: s_min_u32 s1, s1, s6
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1032GISEL-NEXT: ; %bb.5:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1032GISEL-NEXT: .LBB5_6: ; %endif
+; GFX1032GISEL-NEXT: .LBB6_6: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1450,21 +2386,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164DAGISEL-NEXT: s_mov_b32 s6, -1
-; GFX1164DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_min_u32 s6, s6, s8
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1164DAGISEL-NEXT: ; %bb.5:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1164DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1480,27 +2416,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b32 s6, s2
-; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_5
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_5
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164GISEL-NEXT: s_mov_b32 s6, -1
-; GFX1164GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_min_u32 s6, s6, s8
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB5_4
-; GFX1164GISEL-NEXT: .LBB5_5: ; %endif
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX1164GISEL-NEXT: .LBB6_5: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
@@ -1525,21 +2461,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, -1
-; GFX1132DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_min_u32 s1, s1, s6
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1132DAGISEL-NEXT: ; %bb.5:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT: .LBB5_6: ; %endif
+; GFX1132DAGISEL-NEXT: .LBB6_6: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1555,31 +2491,31 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b32 s1, s1
-; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, s0
; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_6
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_6
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132GISEL-NEXT: s_mov_b32 s1, -1
-; GFX1132GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB6_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_min_u32 s1, s1, s6
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB5_4
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_4
; GFX1132GISEL-NEXT: ; %bb.5:
; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1132GISEL-NEXT: .LBB5_6: ; %endif
+; GFX1132GISEL-NEXT: .LBB6_6: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, 0
@@ -1717,7 +2653,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], -1
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1728,7 +2664,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX8DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1741,7 +2677,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[4:5], -1
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1752,7 +2688,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX8GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1765,7 +2701,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], -1
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1776,7 +2712,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX9DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1789,7 +2725,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], -1
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1800,7 +2736,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX9GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1813,7 +2749,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], -1
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1824,7 +2760,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX1064DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1836,7 +2772,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], -1
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1847,7 +2783,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s12
; GFX1064GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1859,7 +2795,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], -1
; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1870,7 +2806,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7
; GFX1032DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1882,7 +2818,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], -1
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1893,7 +2829,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7
; GFX1032GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -1905,7 +2841,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], -1
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s0
@@ -1917,7 +2853,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s8
; GFX1164DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -1929,7 +2865,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], -1
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, s0
@@ -1941,7 +2877,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s8
; GFX1164GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -1953,7 +2889,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], -1
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -1964,7 +2900,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -1975,7 +2911,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], -1
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -1986,7 +2922,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -2027,24 +2963,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX8GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB9_2: ; %Flow
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[4:5], s[4:5]
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX8GISEL-NEXT: .LBB8_4: ; %endif
+; GFX8GISEL-NEXT: .LBB9_4: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -2079,24 +3015,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX9GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB9_2: ; %Flow
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX9GISEL-NEXT: .LBB8_4: ; %endif
+; GFX9GISEL-NEXT: .LBB9_4: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -2131,19 +3067,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[6:7]
-; GFX1064GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1064GISEL-NEXT: .LBB9_4: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2180,24 +3116,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1032GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1032GISEL-NEXT: .LBB9_4: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -2236,19 +3172,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GFX1164GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1164GISEL-NEXT: .LBB9_4: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7
@@ -2287,23 +3223,23 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b64 s[4:5], s[4:5]
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1132GISEL-NEXT: .LBB8_4: ; %endif
+; GFX1132GISEL-NEXT: .LBB9_4: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
More information about the llvm-branch-commits
mailing list