[llvm] [AMDGPU] Extend Wave Reduce Intrinsics for add, sub, and, or, xor, min, max (Integer type) (PR #111342)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 8 04:35:40 PDT 2024
- Previous message: [llvm] [AMDGPU] Extend Wave Reduce Intrinsics for add, sub, and, or, xor, min, max (Integer type) (PR #111342)
- Next message: [llvm] [AMDGPU] Extend Wave Reduce Intrinsics for add, sub, and, or, xor, min, max (Integer type) (PR #111342)
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/111342
>From 30454d1ceb4e126d08ed01d19fbcd7cc513ecec6 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 24 Sep 2024 15:35:42 +0530
Subject: [PATCH 1/2] Added wave reduce intrinsics for int add,sub,or,xor,and.
Still have to extend for unsigned sub and floats.
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 7 +
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 9 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 236 +++++++++++++++++-
llvm/lib/Target/AMDGPU/SIInstructions.td | 35 +++
.../global_atomics_iterative_scan_fp.ll | 2 +-
5 files changed, 281 insertions(+), 8 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4cd32a0502c66d..097a074859ca10 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2121,6 +2121,13 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_add : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_uadd : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_sub : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_usub : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_and : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_or : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_xor : AMDGPUWaveReduce;
def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index f2c9619cb8276a..c5ee2944e3015e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4851,7 +4851,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case Intrinsic::amdgcn_wave_reduce_umin:
- case Intrinsic::amdgcn_wave_reduce_umax: {
+ case Intrinsic::amdgcn_wave_reduce_umax:
+ case Intrinsic::amdgcn_wave_reduce_and:
+ case Intrinsic::amdgcn_wave_reduce_or:
+ case Intrinsic::amdgcn_wave_reduce_xor:
+ case Intrinsic::amdgcn_wave_reduce_usub:
+ case Intrinsic::amdgcn_wave_reduce_sub:
+ case Intrinsic::amdgcn_wave_reduce_uadd:
+ case Intrinsic::amdgcn_wave_reduce_add: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 10108866a7005a..f787f3d71fc045 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4859,10 +4859,220 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
Register DstReg = MI.getOperand(0).getReg();
MachineBasicBlock *RetBB = nullptr;
if (isSGPR) {
- // These operations with a uniform value i.e. SGPR are idempotent.
- // Reduced value will be same as given sgpr.
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
- RetBB = &BB;
+ switch(Opc){
+ case AMDGPU::S_MIN_U32:
+ case AMDGPU::S_MAX_U32:
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_OR_B32:
+ // These operations with a uniform value i.e. SGPR are idempotent.
+ // Reduced value will be same as given sgpr.
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+ RetBB = &BB;
+ break;
+ // TODO --> add support for Unsigned ADD and unsigned SUB.
+ case AMDGPU::S_XOR_B32:
+ case AMDGPU::S_ADD_U32:
+ case AMDGPU::S_ADD_I32:
+ // case AMDGPU::S_SUB_U32:
+ case AMDGPU::S_SUB_I32:{
+ MachineBasicBlock::iterator I = BB.end();
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ // Create Control flow for loop
+ // Split MI's Machine Basic block into For loop
+ auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
+
+ // Create virtual registers required for lowering.
+ const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+ const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+ Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
+
+ Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
+ Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+
+ Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
+ Register CountOfActiveLanesReg = MRI.createVirtualRegister(DstRegClass);
+
+ bool IsWave32 = ST.isWave32();
+ unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+ // Create initail values of induction variable from Exec, Accumulator and
+ // insert branch instr to newly created ComputeBlock
+ uint32_t InitalValue = 0;
+
+ auto TmpSReg =
+ BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
+ .addImm(InitalValue);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
+
+ // Start constructing ComputeLoop
+ I = ComputeLoop->end();
+ auto Accumulator =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
+ .addReg(InitalValReg)
+ .addMBB(&BB);
+ auto ActiveBits =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
+ .addReg(TmpSReg->getOperand(0).getReg())
+ .addMBB(&BB);
+
+ // Perform the computations
+ unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
+ auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+ .addReg(ActiveBits->getOperand(0).getReg());
+ auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), CountOfActiveLanesReg)
+ .addReg(Accumulator->getOperand(0).getReg())
+ .addImm(1);
+
+ // Manipulate the iterator to get the next active lane
+ unsigned BITSETOpc =
+ IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
+ auto NewActiveBits =
+ BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+ .addReg(FF1->getOperand(0).getReg())
+ .addReg(ActiveBits->getOperand(0).getReg());
+
+ // Add phi nodes
+ Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
+ .addMBB(ComputeLoop);
+ ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
+ .addMBB(ComputeLoop);
+
+ // Creating branching
+ unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
+ BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
+ .addReg(NewActiveBits->getOperand(0).getReg())
+ .addImm(0);
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+ .addMBB(ComputeLoop);
+
+ I = ComputeEnd->begin();
+ switch(Opc){
+ case AMDGPU::S_XOR_B32:{
+ // Performing an XOR operation on a uniform value
+ // depends on the number of active lanes. If there
+ // are an even number of active lanes, then the XOR
+ // will result in 0. And if there are an odd number
+ // of Active lanes then the XOR will result in the
+ // same value as that in the SGPR. This comes from
+ // the fact that A^A = 0 and A^0 = A.
+
+ // Create basic block to check the parity.
+ // MachineFunction &MF = *ComputeEnd->getParent();
+ // MachineBasicBlock *CheckParity = MF.CreateMachineBasicBlock();
+ // MachineFunction::iterator It = ComputeEnd->getIterator();
+ // MF.insert(It, CheckParity);
+ // ComputeLoop->addSuccessor(CheckParity);
+ // ComputeLoop->removeSuccessor(ComputeEnd);
+
+ Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+ // Register Product = MRI.createVirtualRegister(DstRegClass);
+ // Register OddResult = MRI.createVirtualRegister(DstRegClass);
+ // MachineBasicBlock *Even = MF.CreateMachineBasicBlock();
+ // MachineBasicBlock *Odd = MF.CreateMachineBasicBlock();
+ // MF.push_back(Even);
+ // MF.push_back(Odd);
+ // CheckParity->addSuccessor(Even);
+ // CheckParity->addSuccessor(Odd);
+ // Even->addSuccessor(ComputeEnd);
+ // Odd->addSuccessor(ComputeEnd);
+
+ // If the LSB is set, the number is odd, else it is even.
+ // TODO --> is FF0 faster or left-shift by 31 faster or AND 0xfffffffe??
+ // I = CheckParity->begin();
+ auto ParityReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+ .addReg(NewAccumulator->getOperand(0).getReg())
+ .addImm(1);
+
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ .addReg(ParityReg->getOperand(0).getReg())
+ .addImm(SrcReg);
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+ // .addMBB(Even);
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_BRANCH))
+ // .addMBB(Odd);
+
+ // If there are an even number of active lanes, the result is 0.
+ // I = Even->begin();
+ // BuildMI(*Even, I, DL, TII->get(AMDGPU::S_MOV_B32), EvenResult).addImm(0);
+ // BuildMI(*Even, I, DL, TII->get(AMDGPU::S_BRANCH))
+ // .addMBB(ComputeEnd);
+
+ // If there are an odd number of active lanes, the result is the value itself.
+ // I = Odd->begin();
+ // BuildMI(*Odd, I, DL, TII->get(AMDGPU::S_MOV_B32), OddResult).addReg(SrcReg);
+ // BuildMI(*Odd, I, DL, TII->get(AMDGPU::S_BRANCH))
+ // .addMBB(ComputeEnd);
+
+ // Add PHI node to get the appropriate result.
+ // I = ComputeEnd->begin();
+ // auto PhiNode =
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::PHI), DstReg)
+ // .addReg(EvenResult)
+ // .addMBB(Even);
+ // PhiNode.addReg(OddResult)
+ // .addMBB(Odd);
+ break;
+ }
+ case AMDGPU::S_SUB_U32:{
+ // Doubt --> how can you have a negative unsigned value??
+ break;
+ }
+ case AMDGPU::S_SUB_I32:{
+ // TODO --> use 2's compliment or subtract from 0 to find the negation of the number.
+ Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+ // Take the negation of the source operand.
+ auto InvertedValReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
+ // Multiply the negated value with the number of active lanes.
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg).addReg(InvertedValReg->getOperand(0).getReg()).addReg(NewAccumulator->getOperand(0).getReg());
+ break;
+ }
+ // Doubt --> is SSA form still have to be followed for MIR?
+ case AMDGPU::S_ADD_U32:{
+ // For unsigned multiplication, zero extend the inputs to 64bits,
+ // perform an unsigned multiplication on them and then store the
+ // 32 lower order bits as the result.
+ Register ExtendedInput = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register ZeroExtension = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register ExtendedCount = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register UnsignedProduct = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ auto ZeroExtented =
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MOV_B32), ZeroExtension)
+ .addImm(0);
+
+ // Zero extend the input to 64bits.
+ auto Input_64 =
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::REG_SEQUENCE), ExtendedInput)
+ .addReg(SrcReg).addImm(AMDGPU::sub0)
+ .addReg(ZeroExtented->getOperand(0).getReg()).addImm(AMDGPU::sub1);
+
+ // Zero extend the number of active lanes to 64bits.
+ auto Count_64 =
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::REG_SEQUENCE), ExtendedCount)
+ .addReg(NewAccumulator->getOperand(0).getReg()).addImm(AMDGPU::sub0)
+ .addReg(ZeroExtented->getOperand(0).getReg()).addImm(AMDGPU::sub1);
+
+ auto Product =
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_U64), UnsignedProduct)
+ .addReg(Input_64->getOperand(0).getReg())
+ .addReg(Count_64->getOperand(0).getReg());
+
+ // Store the lower 32bits of the product as the result.
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(Product->getOperand(0).getReg(), 0, AMDGPU::sub0);
+ break;
+ }
+ case AMDGPU::S_ADD_I32:
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg).addReg(SrcReg).addReg(NewAccumulator->getOperand(0).getReg());
+ }
+
+ RetBB = ComputeEnd;
+ }
+ }
} else {
// TODO: Implement DPP Strategy and switch based on immediate strategy
// operand. For now, for all the cases (default, Iterative and DPP we use
@@ -4898,7 +5108,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
// Create initail values of induction variable from Exec, Accumulator and
- // insert branch instr to newly created ComputeBlockk
+ // insert branch instr to newly created ComputeBlock
uint32_t InitalValue =
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
auto TmpSReg =
@@ -4970,6 +5180,20 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+ case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U32);
+ case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+ case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U32);
+ case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+ case AMDGPU::WAVE_REDUCE_AND_PSEUDO_U32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+ case AMDGPU::WAVE_REDUCE_OR_PSEUDO_U32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+ case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_U32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
const DebugLoc &DL = MI.getDebugLoc();
@@ -6771,7 +6995,7 @@ SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
- // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64
+ // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64 // TODO --> `..are zero-extended to 32-bits, then we ..` , should this be zero-extended from 32 bits?
// with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
// 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9afb29d95abd7d..b61094cd5f6309 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -264,6 +264,41 @@ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
}
+
+ def WAVE_REDUCE_ADD_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_add i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_ADD_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_uadd i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_SUB_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_sub i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_SUB_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_usub i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_AND_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_OR_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_XOR_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_xor i32:$src, i32:$strategy))]> {
+ }
}
let usesCustomInserter = 1, Defs = [VCC] in {
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
index d1e50bd560cb23..02942254cc555b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
@@ -156,7 +156,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_value(ptr addrspace(1) %ptr) #
; IR-DPP: 14:
; IR-DPP-NEXT: ret void
;
- %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 seq_cst
ret void
}
>From 78c41d7139ebf5fc8bd51c0a018fc0fdae8a02db Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Thu, 26 Sep 2024 13:33:57 +0530
Subject: [PATCH 2/2] Wave Reduce Intrinsics for Integer Type(32 bit) ->
Operations: Add, Sub, Min, Max, AND, OR, XOR
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 19 +-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 12 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 239 +--
llvm/lib/Target/AMDGPU/SIInstructions.td | 30 +-
.../global_atomics_iterative_scan_fp.ll | 4 +-
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll | 1280 ++++++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll | 1032 +++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll | 1031 +++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll | 1031 +++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll | 1031 +++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll | 1329 ++++++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll | 1333 +++++++++++++++++
.../AMDGPU/llvm.amdgcn.wave.reduce.add.mir | 90 ++
.../AMDGPU/llvm.amdgcn.wave.reduce.and.mir | 89 ++
.../AMDGPU/llvm.amdgcn.wave.reduce.max.mir | 89 ++
.../AMDGPU/llvm.amdgcn.wave.reduce.min.mir | 89 ++
.../AMDGPU/llvm.amdgcn.wave.reduce.or.mir | 89 ++
.../AMDGPU/llvm.amdgcn.wave.reduce.sub.mir | 92 ++
.../AMDGPU/llvm.amdgcn.wave.reduce.umax.mir | 4 +-
.../AMDGPU/llvm.amdgcn.wave.reduce.umin.mir | 4 +-
.../AMDGPU/llvm.amdgcn.wave.reduce.xor.mir | 92 ++
21 files changed, 8793 insertions(+), 216 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 097a074859ca10..816078db3b1e06 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2109,7 +2109,7 @@ def int_amdgcn_s_quadmask :
def int_amdgcn_s_wqm :
DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;
-class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
+class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
[data_ty],
[
LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR)
@@ -2119,15 +2119,14 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
-def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_add : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_uadd : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_sub : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_usub : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_and : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_or : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_xor : AMDGPUWaveReduce;
+multiclass AMDGPUWaveReduceGenerator<list<string> Operations> {
+ foreach Op = Operations in {
+ def Op : AMDGPUWaveReduce;
+ }
+}
+
+defvar Operations = ["umin", "min", "umax", "max", "add", "sub", "and", "or", "xor"];
+defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceGenerator<Operations>;
def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 68b0857fd21504..468ee031a60048 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4846,15 +4846,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
break;
}
- case Intrinsic::amdgcn_wave_reduce_umin:
+ case Intrinsic::amdgcn_wave_reduce_add:
+ case Intrinsic::amdgcn_wave_reduce_sub:
+ case Intrinsic::amdgcn_wave_reduce_min:
+ case Intrinsic::amdgcn_wave_reduce_umin:
+ case Intrinsic::amdgcn_wave_reduce_max:
case Intrinsic::amdgcn_wave_reduce_umax:
case Intrinsic::amdgcn_wave_reduce_and:
case Intrinsic::amdgcn_wave_reduce_or:
- case Intrinsic::amdgcn_wave_reduce_xor:
- case Intrinsic::amdgcn_wave_reduce_usub:
- case Intrinsic::amdgcn_wave_reduce_sub:
- case Intrinsic::amdgcn_wave_reduce_uadd:
- case Intrinsic::amdgcn_wave_reduce_add: {
+ case Intrinsic::amdgcn_wave_reduce_xor: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9b4f25ba10d42b..41b3fc65c8bf06 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4863,96 +4863,38 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
if (isSGPR) {
switch(Opc){
case AMDGPU::S_MIN_U32:
+ case AMDGPU::S_MIN_I32:
case AMDGPU::S_MAX_U32:
+ case AMDGPU::S_MAX_I32:
case AMDGPU::S_AND_B32:
- case AMDGPU::S_OR_B32:
+ case AMDGPU::S_OR_B32:{
// These operations with a uniform value i.e. SGPR are idempotent.
// Reduced value will be same as given sgpr.
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
RetBB = &BB;
break;
- // TODO --> add support for Unsigned ADD and unsigned SUB.
+ }
case AMDGPU::S_XOR_B32:
- case AMDGPU::S_ADD_U32:
case AMDGPU::S_ADD_I32:
- // case AMDGPU::S_SUB_U32:
case AMDGPU::S_SUB_I32:{
- MachineBasicBlock::iterator I = BB.end();
- Register SrcReg = MI.getOperand(1).getReg();
-
- // Create Control flow for loop
- // Split MI's Machine Basic block into For loop
- auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
-
- // Create virtual registers required for lowering.
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
- Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
- Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
-
- Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
- Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
- Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
-
- Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
+ Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
Register CountOfActiveLanesReg = MRI.createVirtualRegister(DstRegClass);
bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ unsigned CountReg = IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
// Create initail values of induction variable from Exec, Accumulator and
- // insert branch instr to newly created ComputeBlock
- uint32_t InitalValue = 0;
-
- auto TmpSReg =
- BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
- BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
- .addImm(InitalValue);
- BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
-
- // Start constructing ComputeLoop
- I = ComputeLoop->end();
- auto Accumulator =
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
- .addReg(InitalValReg)
- .addMBB(&BB);
- auto ActiveBits =
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
- .addReg(TmpSReg->getOperand(0).getReg())
- .addMBB(&BB);
-
- // Perform the computations
- unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
- auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
- .addReg(ActiveBits->getOperand(0).getReg());
- auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), CountOfActiveLanesReg)
- .addReg(Accumulator->getOperand(0).getReg())
- .addImm(1);
-
- // Manipulate the iterator to get the next active lane
- unsigned BITSETOpc =
- IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
- auto NewActiveBits =
- BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
- .addReg(FF1->getOperand(0).getReg())
- .addReg(ActiveBits->getOperand(0).getReg());
-
- // Add phi nodes
- Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
- .addMBB(ComputeLoop);
- ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
- .addMBB(ComputeLoop);
-
- // Creating branching
- unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
- BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
- .addReg(NewActiveBits->getOperand(0).getReg())
- .addImm(0);
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
- .addMBB(ComputeLoop);
+ // insert branch instr to newly created ComputeBlock
+ auto Exec =
+ BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+
+ auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), CountOfActiveLanesReg)
+ .addReg(Exec->getOperand(0).getReg());
- I = ComputeEnd->begin();
switch(Opc){
case AMDGPU::S_XOR_B32:{
// Performing an XOR operation on a uniform value
@@ -4963,116 +4905,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
// same value as that in the SGPR. This comes from
// the fact that A^A = 0 and A^0 = A.
- // Create basic block to check the parity.
- // MachineFunction &MF = *ComputeEnd->getParent();
- // MachineBasicBlock *CheckParity = MF.CreateMachineBasicBlock();
- // MachineFunction::iterator It = ComputeEnd->getIterator();
- // MF.insert(It, CheckParity);
- // ComputeLoop->addSuccessor(CheckParity);
- // ComputeLoop->removeSuccessor(ComputeEnd);
-
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
- // Register Product = MRI.createVirtualRegister(DstRegClass);
- // Register OddResult = MRI.createVirtualRegister(DstRegClass);
- // MachineBasicBlock *Even = MF.CreateMachineBasicBlock();
- // MachineBasicBlock *Odd = MF.CreateMachineBasicBlock();
- // MF.push_back(Even);
- // MF.push_back(Odd);
- // CheckParity->addSuccessor(Even);
- // CheckParity->addSuccessor(Odd);
- // Even->addSuccessor(ComputeEnd);
- // Odd->addSuccessor(ComputeEnd);
-
- // If the LSB is set, the number is odd, else it is even.
- // TODO --> is FF0 faster or left-shift by 31 faster or AND 0xfffffffe??
- // I = CheckParity->begin();
- auto ParityReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+
+ auto ParityReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
.addReg(NewAccumulator->getOperand(0).getReg())
.addImm(1);
-
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
- .addReg(ParityReg->getOperand(0).getReg())
- .addImm(SrcReg);
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
- // .addMBB(Even);
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_BRANCH))
- // .addMBB(Odd);
-
- // If there are an even number of active lanes, the result is 0.
- // I = Even->begin();
- // BuildMI(*Even, I, DL, TII->get(AMDGPU::S_MOV_B32), EvenResult).addImm(0);
- // BuildMI(*Even, I, DL, TII->get(AMDGPU::S_BRANCH))
- // .addMBB(ComputeEnd);
-
- // If there are an odd number of active lanes, the result is the value itself.
- // I = Odd->begin();
- // BuildMI(*Odd, I, DL, TII->get(AMDGPU::S_MOV_B32), OddResult).addReg(SrcReg);
- // BuildMI(*Odd, I, DL, TII->get(AMDGPU::S_BRANCH))
- // .addMBB(ComputeEnd);
-
- // Add PHI node to get the appropriate result.
- // I = ComputeEnd->begin();
- // auto PhiNode =
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::PHI), DstReg)
- // .addReg(EvenResult)
- // .addMBB(Even);
- // PhiNode.addReg(OddResult)
- // .addMBB(Odd);
- break;
- }
- case AMDGPU::S_SUB_U32:{
- // Doubt --> how can you have a negative unsigned value??
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ .addReg(SrcReg)
+ .addReg(ParityReg->getOperand(0).getReg()) ;
break;
}
case AMDGPU::S_SUB_I32:{
- // TODO --> use 2's compliment or subtract from 0 to find the negation of the number.
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+
// Take the negation of the source operand.
- auto InvertedValReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
- // Multiply the negated value with the number of active lanes.
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg).addReg(InvertedValReg->getOperand(0).getReg()).addReg(NewAccumulator->getOperand(0).getReg());
+ auto InvertedValReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ .addReg(InvertedValReg->getOperand(0).getReg())
+ .addReg(NewAccumulator->getOperand(0).getReg());
break;
}
- // Doubt --> is SSA form still have to be followed for MIR?
- case AMDGPU::S_ADD_U32:{
- // For unsigned multiplication, zero extend the inputs to 64bits,
- // perform an unsigned multiplication on them and then store the
- // 32 lower order bits as the result.
- Register ExtendedInput = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- Register ZeroExtension = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- Register ExtendedCount = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- Register UnsignedProduct = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
-
- auto ZeroExtented =
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MOV_B32), ZeroExtension)
- .addImm(0);
-
- // Zero extend the input to 64bits.
- auto Input_64 =
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::REG_SEQUENCE), ExtendedInput)
- .addReg(SrcReg).addImm(AMDGPU::sub0)
- .addReg(ZeroExtented->getOperand(0).getReg()).addImm(AMDGPU::sub1);
-
- // Zero extend the number of active lanes to 64bits.
- auto Count_64 =
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::REG_SEQUENCE), ExtendedCount)
- .addReg(NewAccumulator->getOperand(0).getReg()).addImm(AMDGPU::sub0)
- .addReg(ZeroExtented->getOperand(0).getReg()).addImm(AMDGPU::sub1);
-
- auto Product =
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_U64), UnsignedProduct)
- .addReg(Input_64->getOperand(0).getReg())
- .addReg(Count_64->getOperand(0).getReg());
-
- // Store the lower 32bits of the product as the result.
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(Product->getOperand(0).getReg(), 0, AMDGPU::sub0);
+ case AMDGPU::S_ADD_I32:{
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ .addReg(SrcReg)
+ .addReg(NewAccumulator->getOperand(0).getReg());
break;
}
- case AMDGPU::S_ADD_I32:
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg).addReg(SrcReg).addReg(NewAccumulator->getOperand(0).getReg());
}
-
- RetBB = ComputeEnd;
+ RetBB = &BB;
}
}
} else {
@@ -5111,8 +4971,29 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
// Create initail values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlock
- uint32_t InitalValue =
- (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+ uint32_t InitalValue;
+ switch(Opc){
+ case AMDGPU::S_MIN_U32:
+ InitalValue = std::numeric_limits<uint32_t>::max();
+ break;
+ case AMDGPU::S_MIN_I32:
+ InitalValue = std::numeric_limits<int32_t>::max();
+ break;
+ case AMDGPU::S_MAX_U32:
+ InitalValue = 0;
+ break;
+ case AMDGPU::S_MAX_I32:
+ InitalValue = std::numeric_limits<int32_t>::min();
+ break;
+ case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_SUB_I32:
+ case AMDGPU::S_OR_B32:
+ case AMDGPU::S_XOR_B32:
+ InitalValue = 0x00000000;
+ break;
+ case AMDGPU::S_AND_B32:
+ InitalValue = 0xFFFFFFFF;
+ }
auto TmpSReg =
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -5178,23 +5059,23 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
switch (MI.getOpcode()) {
- case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
+ case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
- case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
+ case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
+ case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
- case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U32:
- return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U32);
+ case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
- case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U32:
- return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
- case AMDGPU::WAVE_REDUCE_AND_PSEUDO_U32:
+ case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
- case AMDGPU::WAVE_REDUCE_OR_PSEUDO_U32:
+ case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
- case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_U32:
+ case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
@@ -7083,7 +6964,7 @@ SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
- // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64 // TODO --> `..are zero-extended to 32-bits, then we ..` , should this be zero-extended from 32 bits?
+ // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
// with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
// 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index b61094cd5f6309..08a489e549bd63 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -255,24 +255,29 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
- def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ def WAVE_REDUCE_MIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
}
- def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ def WAVE_REDUCE_MIN_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_min i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_MAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
}
- def WAVE_REDUCE_ADD_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ def WAVE_REDUCE_MAX_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
- [(set i32:$sdst, (int_amdgcn_wave_reduce_add i32:$src, i32:$strategy))]> {
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_max i32:$src, i32:$strategy))]> {
}
-
- def WAVE_REDUCE_ADD_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+
+ def WAVE_REDUCE_ADD_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
- [(set i32:$sdst, (int_amdgcn_wave_reduce_uadd i32:$src, i32:$strategy))]> {
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_add i32:$src, i32:$strategy))]> {
}
def WAVE_REDUCE_SUB_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
@@ -280,22 +285,17 @@ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses
[(set i32:$sdst, (int_amdgcn_wave_reduce_sub i32:$src, i32:$strategy))]> {
}
- def WAVE_REDUCE_SUB_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
- (ins VSrc_b32: $src, VSrc_b32:$strategy),
- [(set i32:$sdst, (int_amdgcn_wave_reduce_usub i32:$src, i32:$strategy))]> {
- }
-
- def WAVE_REDUCE_AND_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ def WAVE_REDUCE_AND_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> {
}
- def WAVE_REDUCE_OR_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ def WAVE_REDUCE_OR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> {
}
- def WAVE_REDUCE_XOR_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ def WAVE_REDUCE_XOR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_xor i32:$src, i32:$strategy))]> {
}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
index 02942254cc555b..fe47715681c389 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
@@ -132,7 +132,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_value(ptr addrspace(1) %ptr) #
; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0
; IR-ITERATIVE-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]]
; IR-ITERATIVE: 12:
-; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4
; IR-ITERATIVE-NEXT: br label [[TMP14]]
; IR-ITERATIVE: 14:
; IR-ITERATIVE-NEXT: ret void
@@ -151,7 +151,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_value(ptr addrspace(1) %ptr) #
; IR-DPP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0
; IR-DPP-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]]
; IR-DPP: 12:
-; IR-DPP-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4
+; IR-DPP-NEXT: [[TMP13:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4
; IR-DPP-NEXT: br label [[TMP14]]
; IR-DPP: 14:
; IR-DPP-NEXT: ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
new file mode 100644
index 00000000000000..20e8b76aeed16b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
@@ -0,0 +1,1280 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+declare i32 @llvm.amdgcn.wave.reduce.add.i32(i32, i32 immarg)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_clause 0x1
+; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_clause 0x1
+; GFX1032GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s4, 0
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_add_i32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_add_i32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_add_i32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_add_i32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s4, s6, s4
+; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s6, s6, s4
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, 0
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8GISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s4, s6, s4
+; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9DAGISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s4
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9GISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s4, s6, s4
+; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064DAGISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s4
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064GISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s4
+; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1032DAGISEL-NEXT: s_add_i32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s4
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1032GISEL-NEXT: s_add_i32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[2:3], 0x2c
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s4, s6, s4
+; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[4:5]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1164DAGISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[2:3], 0x2c
+; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s6, s6, s4
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[4:5]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1164GISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c
+; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s4
+; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1132DAGISEL-NEXT: s_add_i32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c
+; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s4
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1132GISEL-NEXT: s_add_i32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10DAGISEL: {{.*}}
+; GFX10GISEL: {{.*}}
+; GFX11DAGISEL: {{.*}}
+; GFX11GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
new file mode 100644
index 00000000000000..f70d9b476e830d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
@@ -0,0 +1,1032 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+declare i32 @llvm.amdgcn.wave.reduce.and.i32(i32, i32 immarg)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_load_dword s2, s[2:3], 0x2c
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_clause 0x1
+; GFX10DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_clause 0x1
+; GFX10GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: const_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v0
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v0
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value:
+; GFX11DAGISEL: ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX11DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11DAGISEL-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11DAGISEL-NEXT: s_nop 0
+; GFX11DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11DAGISEL-NEXT: s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value:
+; GFX11GISEL: ; %bb.0: ; %entry
+; GFX11GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX11GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11GISEL-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11GISEL-NEXT: s_nop 0
+; GFX11GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, -1
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_and_b32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s4, -1
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_and_b32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, -1
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_and_b32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s4, -1
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_and_b32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, -1
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_and_b32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s4, -1
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_and_b32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, -1
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s2, -1
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_and_b32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s4, -1
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_and_b32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s4, -1
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_and_b32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, -1
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s2, -1
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_and_b32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, -1
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT: s_and_b32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mov_b32 s6, s4
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, -1
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8GISEL-NEXT: s_and_b32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, -1
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9DAGISEL-NEXT: s_and_b32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mov_b32 s6, s4
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9GISEL-NEXT: s_and_b32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, -1
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064DAGISEL-NEXT: s_and_b32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mov_b32 s6, s4
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, -1
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064GISEL-NEXT: s_and_b32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, -1
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1032DAGISEL-NEXT: s_and_b32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mov_b32 s0, s0
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, -1
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1032GISEL-NEXT: s_and_b32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s6, -1
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[4:5]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1164DAGISEL-NEXT: s_and_b32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mov_b32 s6, s4
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s6, -1
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[4:5]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1164GISEL-NEXT: s_and_b32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, -1
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1132DAGISEL-NEXT: s_and_b32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mov_b32 s0, s0
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, -1
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1132GISEL-NEXT: s_and_b32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
new file mode 100644
index 00000000000000..f38fb59109443c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
@@ -0,0 +1,1031 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+declare i32 @llvm.amdgcn.wave.reduce.max.i32(i32, i32 immarg)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_load_dword s2, s[2:3], 0x2c
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_clause 0x1
+; GFX10DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_clause 0x1
+; GFX10GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: const_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v0
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v0
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value:
+; GFX11DAGISEL: ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX11DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11DAGISEL-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11DAGISEL-NEXT: s_nop 0
+; GFX11DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11DAGISEL-NEXT: s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value:
+; GFX11GISEL: ; %bb.0: ; %entry
+; GFX11GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX11GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11GISEL-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11GISEL-NEXT: s_nop 0
+; GFX11GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_brev_b32 s4, 1
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_max_i32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_brev_b32 s4, 1
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_max_i32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_brev_b32 s4, 1
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_max_i32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_brev_b32 s4, 1
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_max_i32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_brev_b32 s4, 1
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_max_i32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_brev_b32 s4, 1
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_max_i32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_brev_b32 s2, 1
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_max_i32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_brev_b32 s2, 1
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_max_i32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_brev_b32 s4, 1
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_max_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_brev_b32 s4, 1
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_max_i32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_brev_b32 s2, 1
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_max_i32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_brev_b32 s2, 1
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_max_i32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_brev_b32 s6, 1
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT: s_max_i32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mov_b32 s6, s4
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_brev_b32 s6, 1
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8GISEL-NEXT: s_max_i32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_brev_b32 s6, 1
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9DAGISEL-NEXT: s_max_i32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mov_b32 s6, s4
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_brev_b32 s6, 1
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9GISEL-NEXT: s_max_i32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: s_brev_b32 s6, 1
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064DAGISEL-NEXT: s_max_i32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mov_b32 s6, s4
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: s_brev_b32 s6, 1
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064GISEL-NEXT: s_max_i32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: s_brev_b32 s1, 1
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1032DAGISEL-NEXT: s_max_i32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mov_b32 s0, s0
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: s_brev_b32 s0, 1
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1032GISEL-NEXT: s_max_i32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164DAGISEL-NEXT: s_brev_b32 s6, 1
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[4:5]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1164DAGISEL-NEXT: s_max_i32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mov_b32 s6, s4
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164GISEL-NEXT: s_brev_b32 s6, 1
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[4:5]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1164GISEL-NEXT: s_max_i32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132DAGISEL-NEXT: s_brev_b32 s1, 1
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1132DAGISEL-NEXT: s_max_i32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mov_b32 s0, s0
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132GISEL-NEXT: s_brev_b32 s0, 1
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1132GISEL-NEXT: s_max_i32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
new file mode 100644
index 00000000000000..20a7fd25bb729f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
@@ -0,0 +1,1031 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+declare i32 @llvm.amdgcn.wave.reduce.min.i32(i32, i32 immarg)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_load_dword s2, s[2:3], 0x2c
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_clause 0x1
+; GFX10DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_clause 0x1
+; GFX10GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: const_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v0
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v0
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value:
+; GFX11DAGISEL: ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX11DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11DAGISEL-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11DAGISEL-NEXT: s_nop 0
+; GFX11DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11DAGISEL-NEXT: s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value:
+; GFX11GISEL: ; %bb.0: ; %entry
+; GFX11GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX11GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11GISEL-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11GISEL-NEXT: s_nop 0
+; GFX11GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_brev_b32 s4, -2
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_min_i32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_brev_b32 s4, -2
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_min_i32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_brev_b32 s4, -2
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_min_i32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_brev_b32 s4, -2
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_min_i32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_brev_b32 s4, -2
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_min_i32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_brev_b32 s4, -2
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_min_i32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_brev_b32 s2, -2
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_min_i32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_brev_b32 s2, -2
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_min_i32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_brev_b32 s4, -2
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_min_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_brev_b32 s4, -2
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_min_i32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_brev_b32 s2, -2
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_min_i32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_brev_b32 s2, -2
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_min_i32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_brev_b32 s6, -2
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT: s_min_i32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mov_b32 s6, s4
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_brev_b32 s6, -2
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8GISEL-NEXT: s_min_i32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_brev_b32 s6, -2
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9DAGISEL-NEXT: s_min_i32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mov_b32 s6, s4
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_brev_b32 s6, -2
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9GISEL-NEXT: s_min_i32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: s_brev_b32 s6, -2
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064DAGISEL-NEXT: s_min_i32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mov_b32 s6, s4
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: s_brev_b32 s6, -2
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064GISEL-NEXT: s_min_i32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: s_brev_b32 s1, -2
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1032DAGISEL-NEXT: s_min_i32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mov_b32 s0, s0
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: s_brev_b32 s0, -2
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1032GISEL-NEXT: s_min_i32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164DAGISEL-NEXT: s_brev_b32 s6, -2
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[4:5]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1164DAGISEL-NEXT: s_min_i32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mov_b32 s6, s4
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164GISEL-NEXT: s_brev_b32 s6, -2
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[4:5]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1164GISEL-NEXT: s_min_i32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132DAGISEL-NEXT: s_brev_b32 s1, -2
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1132DAGISEL-NEXT: s_min_i32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mov_b32 s0, s0
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132GISEL-NEXT: s_brev_b32 s0, -2
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1132GISEL-NEXT: s_min_i32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
new file mode 100644
index 00000000000000..064540ec540732
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
@@ -0,0 +1,1031 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+declare i32 @llvm.amdgcn.wave.reduce.or.i32(i32, i32 immarg)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_load_dword s2, s[2:3], 0x2c
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_clause 0x1
+; GFX10DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_clause 0x1
+; GFX10GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: const_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v0
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v0
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value:
+; GFX11DAGISEL: ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX11DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11DAGISEL-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11DAGISEL-NEXT: s_nop 0
+; GFX11DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11DAGISEL-NEXT: s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value:
+; GFX11GISEL: ; %bb.0: ; %entry
+; GFX11GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX11GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11GISEL-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11GISEL-NEXT: s_nop 0
+; GFX11GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_or_b32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_or_b32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_or_b32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s4, 0
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_or_b32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_or_b32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_or_b32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_or_b32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_or_b32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_or_b32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_or_b32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_or_b32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_or_b32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT: s_or_b32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mov_b32 s6, s4
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, 0
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8GISEL-NEXT: s_or_b32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9DAGISEL-NEXT: s_or_b32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mov_b32 s6, s4
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9GISEL-NEXT: s_or_b32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064DAGISEL-NEXT: s_or_b32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mov_b32 s6, s4
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064GISEL-NEXT: s_or_b32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1032DAGISEL-NEXT: s_or_b32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mov_b32 s0, s0
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1032GISEL-NEXT: s_or_b32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[4:5]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1164DAGISEL-NEXT: s_or_b32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mov_b32 s6, s4
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[4:5]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1164GISEL-NEXT: s_or_b32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1132DAGISEL-NEXT: s_or_b32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mov_b32 s0, s0
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1132GISEL-NEXT: s_or_b32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
new file mode 100644
index 00000000000000..422b688324cd71
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
@@ -0,0 +1,1329 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+declare i32 @llvm.amdgcn.wave.reduce.sub.i32(i32, i32 immarg)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s3, s4, -1
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s3, s4, -1
+; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, s4, -1
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s3, s4, -1
+; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s4, -1
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_clause 0x1
+; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s3, s4, -1
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s4, -1
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_clause 0x1
+; GFX1032GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s3, s4, -1
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s4, -1
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s3, s4, -1
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s4, -1
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s3, s4, -1
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mul_i32 s4, -1, 0x7b
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s4, s0, -1
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s4, 0
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_sub_i32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_sub_i32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_sub_i32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_sub_i32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s5, s6, -1
+; GFX8DAGISEL-NEXT: s_mul_i32 s4, s5, s4
+; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s5, s6, -1
+; GFX8GISEL-NEXT: s_mul_i32 s6, s5, s4
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, 0
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8GISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s5, s6, -1
+; GFX9DAGISEL-NEXT: s_mul_i32 s4, s5, s4
+; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9DAGISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s5, s6, -1
+; GFX9GISEL-NEXT: s_mul_i32 s6, s5, s4
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9GISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s5, s6, -1
+; GFX1064DAGISEL-NEXT: s_mul_i32 s4, s5, s4
+; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064DAGISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s5, s6, -1
+; GFX1064GISEL-NEXT: s_mul_i32 s6, s5, s4
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064GISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, -1
+; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s4
+; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1032DAGISEL-NEXT: s_sub_i32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, -1
+; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s4
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1032GISEL-NEXT: s_sub_i32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[2:3], 0x2c
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s5, s6, -1
+; GFX1164DAGISEL-NEXT: s_mul_i32 s4, s5, s4
+; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[4:5]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1164DAGISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[2:3], 0x2c
+; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s5, s6, -1
+; GFX1164GISEL-NEXT: s_mul_i32 s6, s5, s4
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[4:5]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1164GISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c
+; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, -1
+; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s4
+; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1132DAGISEL-NEXT: s_sub_i32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c
+; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, -1
+; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s4
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1132GISEL-NEXT: s_sub_i32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10DAGISEL: {{.*}}
+; GFX10GISEL: {{.*}}
+; GFX11DAGISEL: {{.*}}
+; GFX11GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
new file mode 100644
index 00000000000000..f04123634a5df6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
@@ -0,0 +1,1333 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+declare i32 @llvm.amdgcn.wave.reduce.xor.i32(i32, i32 immarg)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_clause 0x1
+; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_clause 0x1
+; GFX1032GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX8DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX8GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX9DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX9GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1064DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1064GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1032DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1032GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1164DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1164GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1132GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_xor_b32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_xor_b32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_xor_b32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s4, 0
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_xor_b32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_xor_b32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_xor_b32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_xor_b32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_xor_b32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_xor_b32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_xor_b32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_xor_b32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_xor_b32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8DAGISEL-NEXT: s_and_b32 s4, s4, 1
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s4, s6, s4
+; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT: s_xor_b32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8GISEL-NEXT: s_and_b32 s4, s4, 1
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s6, s6, s4
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, 0
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8GISEL-NEXT: s_xor_b32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9DAGISEL-NEXT: s_and_b32 s4, s4, 1
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s4, s6, s4
+; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9DAGISEL-NEXT: s_xor_b32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9GISEL-NEXT: s_and_b32 s4, s4, 1
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s4
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9GISEL-NEXT: s_xor_b32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064DAGISEL-NEXT: s_and_b32 s4, s4, 1
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s4, s6, s4
+; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064DAGISEL-NEXT: s_xor_b32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064GISEL-NEXT: s_and_b32 s4, s4, 1
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s4
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064GISEL-NEXT: s_xor_b32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1032DAGISEL-NEXT: s_and_b32 s4, s4, 1
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s4
+; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1032DAGISEL-NEXT: s_xor_b32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1032GISEL-NEXT: s_and_b32 s4, s4, 1
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s4
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1032GISEL-NEXT: s_xor_b32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[2:3], 0x2c
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164DAGISEL-NEXT: s_and_b32 s4, s4, 1
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s4, s6, s4
+; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[4:5]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1164DAGISEL-NEXT: s_xor_b32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_nop 0
+; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[2:3], 0x2c
+; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164GISEL-NEXT: s_and_b32 s4, s4, 1
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s6, s6, s4
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[4:5]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1164GISEL-NEXT: s_xor_b32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_nop 0
+; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c
+; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1132DAGISEL-NEXT: s_and_b32 s4, s4, 1
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s4
+; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1132DAGISEL-NEXT: s_xor_b32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_nop 0
+; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c
+; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1132GISEL-NEXT: s_and_b32 s4, s4, 1
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s4
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX1132GISEL-NEXT: s_xor_b32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_nop 0
+; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10DAGISEL: {{.*}}
+; GFX10GISEL: {{.*}}
+; GFX11DAGISEL: {{.*}}
+; GFX11GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir
new file mode 100644
index 00000000000000..6c0e2bb93afd58
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir
@@ -0,0 +1,90 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: uniform_value
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0.entry:
+ liveins: $sgpr0_sgpr1
+
+ ; GCN-LABEL: name: uniform_value
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_BCNT1_I32_B64_:%[0-9]+]]:sgpr_32 = S_BCNT1_I32_B64 [[S_MOV_B64_]], implicit-def $scc
+ ; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sgpr_32 = S_MUL_I32 [[S_LOAD_DWORD_IMM]], [[S_BCNT1_I32_B64_]]
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
+ %7:sgpr_32 = WAVE_REDUCE_ADD_PSEUDO_I32 killed %6, 1, implicit $exec
+ %8:vgpr_32 = COPY %7
+ GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+---
+name: divergent_value
+machineFunctionInfo:
+ isEntryFunction: true
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: divergent_value
+ ; GCN: bb.0.entry:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; GCN-NEXT: S_BRANCH %bb.2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
+ ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
+ ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[S_ADD_I32_:%[0-9]+]]:sgpr_32 = S_ADD_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
+ ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
+ ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr0_sgpr1
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %6:sgpr_32 = WAVE_REDUCE_ADD_PSEUDO_I32 %0, 1, implicit $exec
+ %7:vgpr_32 = COPY %6
+ GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
+ bb.1:
+ %8:vgpr_32 = PHI %0, %bb.0
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir
new file mode 100644
index 00000000000000..72b485719c9cd4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir
@@ -0,0 +1,89 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: uniform_value
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0.entry:
+ liveins: $sgpr0_sgpr1
+
+ ; GCN-LABEL: name: uniform_value
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]]
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
+ %7:sgpr_32 = WAVE_REDUCE_AND_PSEUDO_B32 killed %6, 1, implicit $exec
+ %8:vgpr_32 = COPY %7
+ GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+---
+name: divergent_value
+machineFunctionInfo:
+ isEntryFunction: true
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: divergent_value
+ ; GCN: bb.0.entry:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 4294967295
+ ; GCN-NEXT: S_BRANCH %bb.2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
+ ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
+ ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
+ ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
+ ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr0_sgpr1
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %6:sgpr_32 = WAVE_REDUCE_AND_PSEUDO_B32 %0, 1, implicit $exec
+ %7:vgpr_32 = COPY %6
+ GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
+ bb.1:
+ %8:vgpr_32 = PHI %0, %bb.0
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir
new file mode 100644
index 00000000000000..17d9a6daac9268
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir
@@ -0,0 +1,89 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: uniform_value
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0.entry:
+ liveins: $sgpr0_sgpr1
+
+ ; GCN-LABEL: name: uniform_value
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]]
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
+ %7:sgpr_32 = WAVE_REDUCE_MAX_PSEUDO_I32 killed %6, 1, implicit $exec
+ %8:vgpr_32 = COPY %7
+ GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+---
+name: divergent_value
+machineFunctionInfo:
+ isEntryFunction: true
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: divergent_value
+ ; GCN: bb.0.entry:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 2147483648
+ ; GCN-NEXT: S_BRANCH %bb.2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
+ ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
+ ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[S_MAX_I32_:%[0-9]+]]:sgpr_32 = S_MAX_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
+ ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
+ ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MAX_I32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr0_sgpr1
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %6:sgpr_32 = WAVE_REDUCE_MAX_PSEUDO_I32 %0, 1, implicit $exec
+ %7:vgpr_32 = COPY %6
+ GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
+ bb.1:
+ %8:vgpr_32 = PHI %0, %bb.0
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir
new file mode 100644
index 00000000000000..45a6d248e834aa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir
@@ -0,0 +1,89 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: uniform_value
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0.entry:
+ liveins: $sgpr0_sgpr1
+
+ ; GCN-LABEL: name: uniform_value
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]]
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
+ %7:sgpr_32 = WAVE_REDUCE_MIN_PSEUDO_I32 killed %6, 1, implicit $exec
+ %8:vgpr_32 = COPY %7
+ GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+---
+name: divergent_value
+machineFunctionInfo:
+ isEntryFunction: true
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: divergent_value
+ ; GCN: bb.0.entry:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 2147483647
+ ; GCN-NEXT: S_BRANCH %bb.2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
+ ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
+ ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[S_MIN_I32_:%[0-9]+]]:sgpr_32 = S_MIN_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
+ ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
+ ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MIN_I32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr0_sgpr1
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %6:sgpr_32 = WAVE_REDUCE_MIN_PSEUDO_I32 %0, 1, implicit $exec
+ %7:vgpr_32 = COPY %6
+ GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
+ bb.1:
+ %8:vgpr_32 = PHI %0, %bb.0
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir
new file mode 100644
index 00000000000000..1f23509cb4002b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir
@@ -0,0 +1,89 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: uniform_value
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0.entry:
+ liveins: $sgpr0_sgpr1
+
+ ; GCN-LABEL: name: uniform_value
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]]
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
+ %7:sgpr_32 = WAVE_REDUCE_OR_PSEUDO_B32 killed %6, 1, implicit $exec
+ %8:vgpr_32 = COPY %7
+ GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+---
+name: divergent_value
+machineFunctionInfo:
+ isEntryFunction: true
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: divergent_value
+ ; GCN: bb.0.entry:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; GCN-NEXT: S_BRANCH %bb.2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
+ ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
+ ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[S_OR_B32_:%[0-9]+]]:sgpr_32 = S_OR_B32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
+ ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
+ ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr0_sgpr1
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %6:sgpr_32 = WAVE_REDUCE_OR_PSEUDO_B32 %0, 1, implicit $exec
+ %7:vgpr_32 = COPY %6
+ GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
+ bb.1:
+ %8:vgpr_32 = PHI %0, %bb.0
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir
new file mode 100644
index 00000000000000..6af5f2e9b5a17a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir
@@ -0,0 +1,92 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: uniform_value
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0.entry:
+ liveins: $sgpr0_sgpr1
+
+ ; GCN-LABEL: name: uniform_value
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_BCNT1_I32_B64_:%[0-9]+]]:sgpr_32 = S_BCNT1_I32_B64 [[S_MOV_B64_]], implicit-def $scc
+ ; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sgpr_32 = S_MUL_I32 -1, [[S_LOAD_DWORD_IMM]]
+ ; GCN-NEXT: [[S_MUL_I32_1:%[0-9]+]]:sgpr_32 = S_MUL_I32 [[S_MUL_I32_]], [[S_BCNT1_I32_B64_]]
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_1]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
+ %7:sgpr_32 = WAVE_REDUCE_SUB_PSEUDO_I32 killed %6, 1, implicit $exec
+ %8:vgpr_32 = COPY %7
+ GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+---
+name: divergent_value
+machineFunctionInfo:
+ isEntryFunction: true
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: divergent_value
+ ; GCN: bb.0.entry:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; GCN-NEXT: S_BRANCH %bb.2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
+ ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
+ ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[S_SUB_I32_:%[0-9]+]]:sgpr_32 = S_SUB_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
+ ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
+ ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_SUB_I32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr0_sgpr1
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %6:sgpr_32 = WAVE_REDUCE_SUB_PSEUDO_I32 %0, 1, implicit $exec
+ %7:vgpr_32 = COPY %6
+ GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
+ bb.1:
+ %8:vgpr_32 = PHI %0, %bb.0
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir
index 179c9f4f8dc4d0..2983e646208dd6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir
@@ -26,7 +26,7 @@ body: |
%4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
%6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
- %7:sgpr_32 = WAVE_REDUCE_UMAX_PSEUDO_U32 killed %6, 1, implicit $exec
+ %7:sgpr_32 = WAVE_REDUCE_MAX_PSEUDO_U32 killed %6, 1, implicit $exec
%8:vgpr_32 = COPY %7
GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
S_ENDPGM 0
@@ -79,7 +79,7 @@ body: |
%0:vgpr_32 = COPY $vgpr0
%4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
%5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- %6:sgpr_32 = WAVE_REDUCE_UMAX_PSEUDO_U32 %0, 1, implicit $exec
+ %6:sgpr_32 = WAVE_REDUCE_MAX_PSEUDO_U32 %0, 1, implicit $exec
%7:vgpr_32 = COPY %6
GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir
index 88c35a6417d237..db698a3b29371a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir
@@ -26,7 +26,7 @@ body: |
%4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
%6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
- %7:sgpr_32 = WAVE_REDUCE_UMIN_PSEUDO_U32 killed %6, 1, implicit $exec
+ %7:sgpr_32 = WAVE_REDUCE_MIN_PSEUDO_U32 killed %6, 1, implicit $exec
%8:vgpr_32 = COPY %7
GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
S_ENDPGM 0
@@ -79,7 +79,7 @@ body: |
%0:vgpr_32 = COPY $vgpr0
%4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
%5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- %6:sgpr_32 = WAVE_REDUCE_UMIN_PSEUDO_U32 %0, 1, implicit $exec
+ %6:sgpr_32 = WAVE_REDUCE_MIN_PSEUDO_U32 %0, 1, implicit $exec
%7:vgpr_32 = COPY %6
GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir
new file mode 100644
index 00000000000000..54ac7ceb6a2c1b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir
@@ -0,0 +1,92 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: uniform_value
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0.entry:
+ liveins: $sgpr0_sgpr1
+
+ ; GCN-LABEL: name: uniform_value
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_BCNT1_I32_B64_:%[0-9]+]]:sgpr_32 = S_BCNT1_I32_B64 [[S_MOV_B64_]], implicit-def $scc
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[S_BCNT1_I32_B64_]], 1, implicit-def $scc
+ ; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sgpr_32 = S_MUL_I32 [[S_LOAD_DWORD_IMM]], [[S_AND_B32_]]
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
+ %7:sgpr_32 = WAVE_REDUCE_XOR_PSEUDO_B32 killed %6, 1, implicit $exec
+ %8:vgpr_32 = COPY %7
+ GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+---
+name: divergent_value
+machineFunctionInfo:
+ isEntryFunction: true
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: divergent_value
+ ; GCN: bb.0.entry:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; GCN-NEXT: S_BRANCH %bb.2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
+ ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
+ ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sgpr_32 = S_XOR_B32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
+ ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
+ ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_XOR_B32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr0_sgpr1
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %6:sgpr_32 = WAVE_REDUCE_XOR_PSEUDO_B32 %0, 1, implicit $exec
+ %7:vgpr_32 = COPY %6
+ GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
+ bb.1:
+ %8:vgpr_32 = PHI %0, %bb.0
+ S_ENDPGM 0
+
+...
- Previous message: [llvm] [AMDGPU] Extend Wave Reduce Intrinsics for add, sub, and, or, xor, min, max (Integer type) (PR #111342)
- Next message: [llvm] [AMDGPU] Extend Wave Reduce Intrinsics for add, sub, and, or, xor, min, max (Integer type) (PR #111342)
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
More information about the llvm-commits
mailing list