[llvm] [AMDGPU] Extend wave reduce intrinsics for i32 type (PR #126469)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 18 02:50:51 PDT 2025
https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/126469
>From 5af5499860995ee33c2bd50709cf6ceb90227e21 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Wed, 5 Feb 2025 11:39:49 +0530
Subject: [PATCH 1/7] [AMDGPU] Wave Reduce Intrinsics for i32 type
Currently, wave reduction intrinsics are supported for `umin` and `umax` operations only.
This patch extends support for the following operations:
`uadd`, `add`, `usub`, `sub`, `min`, `max`, `and`, `or`, `xor` for `i32` type.
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 10 +-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 11 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 124 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 31 +-
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll | 1237 ++++++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll | 986 +++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll | 986 +++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll | 986 +++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll | 986 +++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll | 1286 ++++++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.uadd.ll | 1240 ++++++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 24 +-
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 24 +-
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.usub.ll | 1286 ++++++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll | 1290 +++++++++++++++++
.../AMDGPU/llvm.amdgcn.wave.reduce.add.mir | 90 ++
.../AMDGPU/llvm.amdgcn.wave.reduce.and.mir | 89 ++
.../AMDGPU/llvm.amdgcn.wave.reduce.max.mir | 89 ++
.../AMDGPU/llvm.amdgcn.wave.reduce.min.mir | 89 ++
.../AMDGPU/llvm.amdgcn.wave.reduce.or.mir | 89 ++
.../AMDGPU/llvm.amdgcn.wave.reduce.sub.mir | 92 ++
.../AMDGPU/llvm.amdgcn.wave.reduce.umax.mir | 4 +-
.../AMDGPU/llvm.amdgcn.wave.reduce.umin.mir | 4 +-
.../AMDGPU/llvm.amdgcn.wave.reduce.xor.mir | 92 ++
24 files changed, 11096 insertions(+), 49 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.uadd.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.usub.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 412993755dac8..be2f25d034dee 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2342,8 +2342,14 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
-def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
+multiclass AMDGPUWaveReduceOps<list<string> Operations> {
+ foreach Op = Operations in { def Op : AMDGPUWaveReduce; }
+}
+
+defvar Operations = [
+ "umin", "min", "umax", "max", "uadd", "add", "usub", "sub", "and", "or", "xor"
+];
+defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceOps<Operations>;
def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 4391a48ff2b68..5084d2106f232 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5006,8 +5006,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
break;
}
+ case Intrinsic::amdgcn_wave_reduce_add:
+ case Intrinsic::amdgcn_wave_reduce_uadd:
+ case Intrinsic::amdgcn_wave_reduce_sub:
+ case Intrinsic::amdgcn_wave_reduce_usub:
+ case Intrinsic::amdgcn_wave_reduce_min:
case Intrinsic::amdgcn_wave_reduce_umin:
- case Intrinsic::amdgcn_wave_reduce_umax: {
+ case Intrinsic::amdgcn_wave_reduce_max:
+ case Intrinsic::amdgcn_wave_reduce_umax:
+ case Intrinsic::amdgcn_wave_reduce_and:
+ case Intrinsic::amdgcn_wave_reduce_or:
+ case Intrinsic::amdgcn_wave_reduce_xor: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 586de433ea28a..6ade924794368 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5038,6 +5038,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
return LoopBB;
}
+static uint32_t getInitialValueForWaveReduction(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::S_MIN_U32:
+ return std::numeric_limits<uint32_t>::max();
+ case AMDGPU::S_MIN_I32:
+ return std::numeric_limits<int32_t>::max();
+ case AMDGPU::S_MAX_U32:
+ return std::numeric_limits<uint32_t>::min();
+ case AMDGPU::S_MAX_I32:
+ return std::numeric_limits<int32_t>::min();
+ case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_SUB_I32:
+ case AMDGPU::S_OR_B32:
+ case AMDGPU::S_XOR_B32:
+ return std::numeric_limits<uint32_t>::min();
+ case AMDGPU::S_AND_B32:
+ return std::numeric_limits<uint32_t>::max();
+ default:
+ llvm_unreachable("Unexpected opcode in getInitialValueForWaveReduction");
+ }
+}
+
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
MachineBasicBlock &BB,
const GCNSubtarget &ST,
@@ -5053,13 +5075,78 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
Register DstReg = MI.getOperand(0).getReg();
MachineBasicBlock *RetBB = nullptr;
if (isSGPR) {
- // These operations with a uniform value i.e. SGPR are idempotent.
- // Reduced value will be same as given sgpr.
- // clang-format off
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
- .addReg(SrcReg);
- // clang-format on
- RetBB = &BB;
+ switch (Opc) {
+ case AMDGPU::S_MIN_U32:
+ case AMDGPU::S_MIN_I32:
+ case AMDGPU::S_MAX_U32:
+ case AMDGPU::S_MAX_I32:
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_OR_B32: {
+ // Idempotent operations.
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+ RetBB = &BB;
+ break;
+ }
+ case AMDGPU::S_XOR_B32:
+ case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_SUB_I32: {
+ const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+ const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+ Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+
+ bool IsWave32 = ST.isWave32();
+ unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ unsigned CountReg =
+ IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
+
+ auto Exec =
+ BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+
+ auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+ .addReg(Exec->getOperand(0).getReg());
+
+ switch (Opc) {
+ case AMDGPU::S_XOR_B32: {
+ // Performing an XOR operation on a uniform value
+ // depends on the parity of the number of active lanes.
+ // For even parity, the result will be 0, for odd
+ // parity the result will be the same as the input value.
+ Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+
+ auto ParityReg =
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+ .addReg(NewAccumulator->getOperand(0).getReg())
+ .addImm(1);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ .addReg(SrcReg)
+ .addReg(ParityReg->getOperand(0).getReg());
+ break;
+ }
+ case AMDGPU::S_SUB_I32: {
+ Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+
+ // Take the negation of the source operand.
+ auto InvertedValReg =
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
+ .addImm(-1)
+ .addReg(SrcReg);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ .addReg(InvertedValReg->getOperand(0).getReg())
+ .addReg(NewAccumulator->getOperand(0).getReg());
+ break;
+ }
+ case AMDGPU::S_ADD_I32: {
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ .addReg(SrcReg)
+ .addReg(NewAccumulator->getOperand(0).getReg());
+ break;
+ }
+ }
+ RetBB = &BB;
+ }
+ }
} else {
// TODO: Implement DPP Strategy and switch based on immediate strategy
// operand. For now, for all the cases (default, Iterative and DPP we use
@@ -5096,9 +5183,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
// Create initail values of induction variable from Exec, Accumulator and
- // insert branch instr to newly created ComputeBlockk
- uint32_t InitalValue =
- (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+ // insert branch instr to newly created ComputeBlock
+ uint32_t InitalValue = getInitialValueForWaveReduction(Opc);
auto TmpSReg =
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -5170,8 +5256,26 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
switch (MI.getOpcode()) {
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+ case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+ case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+ case AMDGPU::WAVE_REDUCE_UADD_PSEUDO_U32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+ case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+ case AMDGPU::WAVE_REDUCE_USUB_PSEUDO_U32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+ case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+ case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+ case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+ case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
const DebugLoc &DL = MI.getDebugLoc();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 897c30948cf06..c10250852ea7b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -303,16 +303,31 @@ def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
-let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
- def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
- (ins VSrc_b32: $src, VSrc_b32:$strategy),
- [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
+// clang-format off
+defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
+multiclass
+ AMDGPUWaveReducePseudoGenerator<string Op, string DataType, string Size> {
+ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
+ def !toupper(Op) #"_PSEUDO_" #DataType #Size
+ : VPseudoInstSI<(outs SGPR_32 : $sdst),
+ (ins VSrc_b32 : $src, VSrc_b32 : $strategy),
+ [(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
}
+}
+// clang-format on
- def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
- (ins VSrc_b32: $src, VSrc_b32:$strategy),
- [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
- }
+// Input list : [Operation_name,
+// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
+// Size_in_bits]
+defvar Operations = [
+ ["umin", "U", "32"], ["min", "I", "32"], ["umax", "U", "32"],
+ ["max", "I", "32"], ["uadd", "U", "32"], ["add", "I", "32"],
+ ["usub", "U", "32"], ["sub", "I", "32"], ["and", "B", "32"],
+ ["or", "B", "32"], ["xor", "B", "32"]
+];
+
+foreach Op = Operations in {
+ defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1], Op[2]>;
}
let usesCustomInserter = 1, Defs = [VCC] in {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
new file mode 100644
index 0000000000000..e93e8d4108ba0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
@@ -0,0 +1,1237 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_clause 0x1
+; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_clause 0x1
+; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s4, 0
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_add_i32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_add_i32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_add_i32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_add_i32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s6, s6, s2
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, 0
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s2
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s2
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2
+; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT: s_add_i32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT: s_add_i32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s6, s6, s2
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2
+; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT: s_add_i32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT: s_add_i32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10DAGISEL: {{.*}}
+; GFX10GISEL: {{.*}}
+; GFX11DAGISEL: {{.*}}
+; GFX11GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
new file mode 100644
index 0000000000000..ac11fa6a5a7a5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
@@ -0,0 +1,986 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_clause 0x1
+; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_clause 0x1
+; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: const_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v0
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v0
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value:
+; GFX11DAGISEL: ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11DAGISEL-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11DAGISEL-NEXT: s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value:
+; GFX11GISEL: ; %bb.0: ; %entry
+; GFX11GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11GISEL-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, -1
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_and_b32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s4, -1
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_and_b32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, -1
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_and_b32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s4, -1
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_and_b32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, -1
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_and_b32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s4, -1
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_and_b32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, -1
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s2, -1
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_and_b32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s4, -1
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_and_b32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s4, -1
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_and_b32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, -1
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s2, -1
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_and_b32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, -1
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT: s_and_b32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mov_b32 s6, s2
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, -1
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT: s_and_b32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, -1
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT: s_and_b32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mov_b32 s6, s2
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT: s_and_b32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, -1
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT: s_and_b32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mov_b32 s6, s2
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, -1
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT: s_and_b32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, -1
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT: s_and_b32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mov_b32 s0, s0
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, -1
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT: s_and_b32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s6, -1
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT: s_and_b32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mov_b32 s6, s2
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s6, -1
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT: s_and_b32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, -1
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT: s_and_b32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mov_b32 s0, s0
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, -1
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT: s_and_b32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
new file mode 100644
index 0000000000000..2df2886d068c8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
@@ -0,0 +1,986 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_clause 0x1
+; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_clause 0x1
+; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: const_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v0
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v0
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value:
+; GFX11DAGISEL: ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11DAGISEL-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11DAGISEL-NEXT: s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value:
+; GFX11GISEL: ; %bb.0: ; %entry
+; GFX11GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11GISEL-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_brev_b32 s4, 1
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_max_i32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_brev_b32 s4, 1
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_max_i32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_brev_b32 s4, 1
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_max_i32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_brev_b32 s4, 1
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_max_i32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_brev_b32 s4, 1
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_max_i32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_brev_b32 s4, 1
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_max_i32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_brev_b32 s2, 1
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_max_i32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_brev_b32 s2, 1
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_max_i32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_brev_b32 s4, 1
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_max_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_brev_b32 s4, 1
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_max_i32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_brev_b32 s2, 1
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_max_i32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_brev_b32 s2, 1
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_max_i32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_brev_b32 s6, 1
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT: s_max_i32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mov_b32 s6, s2
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_brev_b32 s6, 1
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT: s_max_i32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_brev_b32 s6, 1
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT: s_max_i32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mov_b32 s6, s2
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_brev_b32 s6, 1
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT: s_max_i32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_brev_b32 s6, 1
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT: s_max_i32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mov_b32 s6, s2
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_brev_b32 s6, 1
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT: s_max_i32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: s_brev_b32 s1, 1
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT: s_max_i32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mov_b32 s0, s0
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: s_brev_b32 s0, 1
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT: s_max_i32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_brev_b32 s6, 1
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT: s_max_i32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mov_b32 s6, s2
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_brev_b32 s6, 1
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT: s_max_i32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_brev_b32 s1, 1
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT: s_max_i32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mov_b32 s0, s0
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: s_brev_b32 s0, 1
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT: s_max_i32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
new file mode 100644
index 0000000000000..45335c7234956
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
@@ -0,0 +1,986 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_clause 0x1
+; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_clause 0x1
+; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: const_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v0
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v0
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value:
+; GFX11DAGISEL: ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11DAGISEL-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11DAGISEL-NEXT: s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value:
+; GFX11GISEL: ; %bb.0: ; %entry
+; GFX11GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11GISEL-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_brev_b32 s4, -2
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_min_i32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_brev_b32 s4, -2
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_min_i32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_brev_b32 s4, -2
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_min_i32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_brev_b32 s4, -2
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_min_i32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_brev_b32 s4, -2
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_min_i32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_brev_b32 s4, -2
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_min_i32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_brev_b32 s2, -2
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_min_i32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_brev_b32 s2, -2
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_min_i32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_brev_b32 s4, -2
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_min_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_brev_b32 s4, -2
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_min_i32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_brev_b32 s2, -2
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_min_i32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_brev_b32 s2, -2
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_min_i32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_brev_b32 s6, -2
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT: s_min_i32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mov_b32 s6, s2
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_brev_b32 s6, -2
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT: s_min_i32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_brev_b32 s6, -2
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT: s_min_i32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mov_b32 s6, s2
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_brev_b32 s6, -2
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT: s_min_i32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_brev_b32 s6, -2
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT: s_min_i32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mov_b32 s6, s2
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_brev_b32 s6, -2
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT: s_min_i32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: s_brev_b32 s1, -2
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT: s_min_i32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mov_b32 s0, s0
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: s_brev_b32 s0, -2
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT: s_min_i32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_brev_b32 s6, -2
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT: s_min_i32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mov_b32 s6, s2
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_brev_b32 s6, -2
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT: s_min_i32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_brev_b32 s1, -2
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT: s_min_i32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mov_b32 s0, s0
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: s_brev_b32 s0, -2
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT: s_min_i32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
new file mode 100644
index 0000000000000..590c5cc383d43
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
@@ -0,0 +1,986 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_clause 0x1
+; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_clause 0x1
+; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: const_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: const_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v0
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v0
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX10DAGISEL-LABEL: poison_value:
+; GFX10DAGISEL: ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX10DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: poison_value:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX11DAGISEL-LABEL: poison_value:
+; GFX11DAGISEL: ; %bb.0: ; %entry
+; GFX11DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11DAGISEL-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11DAGISEL-NEXT: s_endpgm
+;
+; GFX11GISEL-LABEL: poison_value:
+; GFX11GISEL: ; %bb.0: ; %entry
+; GFX11GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11GISEL-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_or_b32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_or_b32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_or_b32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s4, 0
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_or_b32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_or_b32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_or_b32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_or_b32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_or_b32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_or_b32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_or_b32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_or_b32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_or_b32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT: s_or_b32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mov_b32 s6, s2
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, 0
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT: s_or_b32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT: s_or_b32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mov_b32 s6, s2
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT: s_or_b32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT: s_or_b32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mov_b32 s6, s2
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT: s_or_b32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT: s_or_b32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mov_b32 s0, s0
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT: s_or_b32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT: s_or_b32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mov_b32 s6, s2
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT: s_or_b32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: ; %bb.2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT: s_or_b32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mov_b32 s0, s0
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT: s_or_b32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
new file mode 100644
index 0000000000000..1377dc51c4b34
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
@@ -0,0 +1,1286 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_clause 0x1
+; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, -1
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_clause 0x1
+; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, -1
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, -1
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, -1
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mul_i32 s4, -1, 0x7b
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s4, s0, -1
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s4, 0
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_sub_i32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_sub_i32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_sub_i32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_sub_i32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX8GISEL-NEXT: s_mul_i32 s6, s3, s2
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, 0
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX9GISEL-NEXT: s_mul_i32 s6, s3, s2
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX1064GISEL-NEXT: s_mul_i32 s6, s3, s2
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, -1
+; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2
+; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT: s_sub_i32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, -1
+; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT: s_sub_i32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX1164GISEL-NEXT: s_mul_i32 s6, s3, s2
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, -1
+; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2
+; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT: s_sub_i32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, -1
+; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT: s_sub_i32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10DAGISEL: {{.*}}
+; GFX10GISEL: {{.*}}
+; GFX11DAGISEL: {{.*}}
+; GFX11GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.uadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.uadd.ll
new file mode 100644
index 0000000000000..d7b0ebb709a77
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.uadd.ll
@@ -0,0 +1,1240 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+declare i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32, i32 immarg)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_clause 0x1
+; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_clause 0x1
+; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s4, 0
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_add_i32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_add_i32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_add_i32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_add_i32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s6, s6, s2
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, 0
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s2
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s2
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2
+; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT: s_add_i32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT: s_add_i32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s6, s6, s2
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2
+; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT: s_add_i32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT: s_add_i32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10DAGISEL: {{.*}}
+; GFX10GISEL: {{.*}}
+; GFX11DAGISEL: {{.*}}
+; GFX11GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index deeceed3a19be..153a343e0babb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
declare i32 @llvm.amdgcn.wave.reduce.umax.i32(i32, i32 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
index 434e761a5f8a2..42974d5761bf0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
declare i32 @llvm.amdgcn.wave.reduce.umin.i32(i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.usub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.usub.ll
new file mode 100644
index 0000000000000..daff91c48c814
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.usub.ll
@@ -0,0 +1,1286 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_clause 0x1
+; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, -1
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_clause 0x1
+; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, -1
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, -1
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, -1
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mul_i32 s4, -1, 0x7b
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s4, s0, -1
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s3, s0, -1
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s4, 0
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_sub_i32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_sub_i32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_sub_i32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_sub_i32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX8GISEL-NEXT: s_mul_i32 s6, s3, s2
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, 0
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX9GISEL-NEXT: s_mul_i32 s6, s3, s2
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX1064GISEL-NEXT: s_mul_i32 s6, s3, s2
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, -1
+; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2
+; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT: s_sub_i32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, -1
+; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT: s_sub_i32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s3, s6, -1
+; GFX1164GISEL-NEXT: s_mul_i32 s6, s3, s2
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, -1
+; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2
+; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT: s_sub_i32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, -1
+; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT: s_sub_i32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10DAGISEL: {{.*}}
+; GFX10GISEL: {{.*}}
+; GFX11DAGISEL: {{.*}}
+; GFX11GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
new file mode 100644
index 0000000000000..6194aea667953
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
@@ -0,0 +1,1290 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_clause 0x1
+; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT: s_and_b32 s3, s3, 1
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_clause 0x1
+; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT: s_and_b32 s3, s3, 1
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT: s_and_b32 s3, s3, 1
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_and_b32 s3, s3, 1
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX8DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX8GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX9DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX9GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1064DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1064GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1032DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1032GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1164DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1164GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1132GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_xor_b32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_xor_b32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_xor_b32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s4, 0
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_xor_b32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_xor_b32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_xor_b32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_xor_b32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_xor_b32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_xor_b32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_xor_b32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_xor_b32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_xor_b32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT: s_xor_b32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s6, s6, s2
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, 0
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT: s_xor_b32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT: s_xor_b32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s2
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT: s_xor_b32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT: s_xor_b32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s2
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT: s_xor_b32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2
+; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT: s_xor_b32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT: s_xor_b32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT: s_xor_b32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s6, s6, s2
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT: s_xor_b32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2
+; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT: s_xor_b32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_and_b32 s2, s2, 1
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT: s_xor_b32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_endpgm
+ entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10DAGISEL: {{.*}}
+; GFX10GISEL: {{.*}}
+; GFX11DAGISEL: {{.*}}
+; GFX11GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir
new file mode 100644
index 0000000000000..ed9005010a71d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir
@@ -0,0 +1,90 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: uniform_value
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0.entry:
+ liveins: $sgpr0_sgpr1
+
+ ; GCN-LABEL: name: uniform_value
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_BCNT1_I32_B64_:%[0-9]+]]:sgpr_32 = S_BCNT1_I32_B64 [[S_MOV_B64_]], implicit-def $scc
+ ; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sgpr_32 = S_MUL_I32 [[S_LOAD_DWORD_IMM]], [[S_BCNT1_I32_B64_]]
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
+ %7:sgpr_32 = WAVE_REDUCE_ADD_PSEUDO_I32 killed %6, 1, implicit $exec
+ %8:vgpr_32 = COPY %7
+ GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+---
+name: divergent_value
+machineFunctionInfo:
+ isEntryFunction: true
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: divergent_value
+ ; GCN: bb.0.entry:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; GCN-NEXT: S_BRANCH %bb.2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
+ ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
+ ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[S_ADD_I32_:%[0-9]+]]:sgpr_32 = S_ADD_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
+ ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
+ ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr0_sgpr1
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %6:sgpr_32 = WAVE_REDUCE_ADD_PSEUDO_I32 %0, 1, implicit $exec
+ %7:vgpr_32 = COPY %6
+ GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
+ bb.1:
+ %8:vgpr_32 = PHI %0, %bb.0
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir
new file mode 100644
index 0000000000000..8ab1905295a8d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir
@@ -0,0 +1,89 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -passes=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: uniform_value
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0.entry:
+ liveins: $sgpr0_sgpr1
+
+ ; GCN-LABEL: name: uniform_value
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]]
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
+ %7:sgpr_32 = WAVE_REDUCE_AND_PSEUDO_B32 killed %6, 1, implicit $exec
+ %8:vgpr_32 = COPY %7
+ GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+---
+name: divergent_value
+machineFunctionInfo:
+ isEntryFunction: true
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: divergent_value
+ ; GCN: bb.0.entry:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 4294967295
+ ; GCN-NEXT: S_BRANCH %bb.2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
+ ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
+ ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
+ ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
+ ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr0_sgpr1
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %6:sgpr_32 = WAVE_REDUCE_AND_PSEUDO_B32 %0, 1, implicit $exec
+ %7:vgpr_32 = COPY %6
+ GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
+ bb.1:
+ %8:vgpr_32 = PHI %0, %bb.0
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir
new file mode 100644
index 0000000000000..a5f8b0dbe52b2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir
@@ -0,0 +1,89 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -passes=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: uniform_value
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0.entry:
+ liveins: $sgpr0_sgpr1
+
+ ; GCN-LABEL: name: uniform_value
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]]
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
+ %7:sgpr_32 = WAVE_REDUCE_MAX_PSEUDO_I32 killed %6, 1, implicit $exec
+ %8:vgpr_32 = COPY %7
+ GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+---
+name: divergent_value
+machineFunctionInfo:
+ isEntryFunction: true
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: divergent_value
+ ; GCN: bb.0.entry:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 2147483648
+ ; GCN-NEXT: S_BRANCH %bb.2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
+ ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
+ ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[S_MAX_I32_:%[0-9]+]]:sgpr_32 = S_MAX_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
+ ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
+ ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MAX_I32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr0_sgpr1
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %6:sgpr_32 = WAVE_REDUCE_MAX_PSEUDO_I32 %0, 1, implicit $exec
+ %7:vgpr_32 = COPY %6
+ GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
+ bb.1:
+ %8:vgpr_32 = PHI %0, %bb.0
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir
new file mode 100644
index 0000000000000..7a0a522d1d3dd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir
@@ -0,0 +1,89 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -passes=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: uniform_value
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0.entry:
+ liveins: $sgpr0_sgpr1
+
+ ; GCN-LABEL: name: uniform_value
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]]
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
+ %7:sgpr_32 = WAVE_REDUCE_MIN_PSEUDO_I32 killed %6, 1, implicit $exec
+ %8:vgpr_32 = COPY %7
+ GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+---
+name: divergent_value
+machineFunctionInfo:
+ isEntryFunction: true
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: divergent_value
+ ; GCN: bb.0.entry:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 2147483647
+ ; GCN-NEXT: S_BRANCH %bb.2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
+ ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
+ ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[S_MIN_I32_:%[0-9]+]]:sgpr_32 = S_MIN_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
+ ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
+ ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MIN_I32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr0_sgpr1
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %6:sgpr_32 = WAVE_REDUCE_MIN_PSEUDO_I32 %0, 1, implicit $exec
+ %7:vgpr_32 = COPY %6
+ GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
+ bb.1:
+ %8:vgpr_32 = PHI %0, %bb.0
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir
new file mode 100644
index 0000000000000..4711240af3ecf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir
@@ -0,0 +1,89 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -passes=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: uniform_value
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0.entry:
+ liveins: $sgpr0_sgpr1
+
+ ; GCN-LABEL: name: uniform_value
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 [[S_LOAD_DWORD_IMM]]
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
+ %7:sgpr_32 = WAVE_REDUCE_OR_PSEUDO_B32 killed %6, 1, implicit $exec
+ %8:vgpr_32 = COPY %7
+ GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+---
+name: divergent_value
+machineFunctionInfo:
+ isEntryFunction: true
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: divergent_value
+ ; GCN: bb.0.entry:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; GCN-NEXT: S_BRANCH %bb.2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
+ ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
+ ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[S_OR_B32_:%[0-9]+]]:sgpr_32 = S_OR_B32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
+ ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
+ ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr0_sgpr1
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %6:sgpr_32 = WAVE_REDUCE_OR_PSEUDO_B32 %0, 1, implicit $exec
+ %7:vgpr_32 = COPY %6
+ GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
+ bb.1:
+ %8:vgpr_32 = PHI %0, %bb.0
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir
new file mode 100644
index 0000000000000..7b729baaaf21f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir
@@ -0,0 +1,92 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -passes=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: uniform_value
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0.entry:
+ liveins: $sgpr0_sgpr1
+
+ ; GCN-LABEL: name: uniform_value
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_BCNT1_I32_B64_:%[0-9]+]]:sgpr_32 = S_BCNT1_I32_B64 [[S_MOV_B64_]], implicit-def $scc
+ ; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sgpr_32 = S_MUL_I32 -1, [[S_LOAD_DWORD_IMM]]
+ ; GCN-NEXT: [[S_MUL_I32_1:%[0-9]+]]:sgpr_32 = S_MUL_I32 [[S_MUL_I32_]], [[S_BCNT1_I32_B64_]]
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_1]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
+ %7:sgpr_32 = WAVE_REDUCE_SUB_PSEUDO_I32 killed %6, 1, implicit $exec
+ %8:vgpr_32 = COPY %7
+ GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+---
+name: divergent_value
+machineFunctionInfo:
+ isEntryFunction: true
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: divergent_value
+ ; GCN: bb.0.entry:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; GCN-NEXT: S_BRANCH %bb.2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
+ ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
+ ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[S_SUB_I32_:%[0-9]+]]:sgpr_32 = S_SUB_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
+ ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
+ ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_SUB_I32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr0_sgpr1
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %6:sgpr_32 = WAVE_REDUCE_SUB_PSEUDO_I32 %0, 1, implicit $exec
+ %7:vgpr_32 = COPY %6
+ GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
+ bb.1:
+ %8:vgpr_32 = PHI %0, %bb.0
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir
index 0733d34f5e366..72792edee1089 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -passes=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s
---
name: uniform_value
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir
index 486c08335b170..4ba296985b4a7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -passes=finalize-isel -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -passes=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s
---
name: uniform_value
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir
new file mode 100644
index 0000000000000..3dbf0c063af92
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir
@@ -0,0 +1,92 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=amdgcn -run-pass=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -passes=finalize-isel %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: uniform_value
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0.entry:
+ liveins: $sgpr0_sgpr1
+
+ ; GCN-LABEL: name: uniform_value
+ ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 44, 0
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_BCNT1_I32_B64_:%[0-9]+]]:sgpr_32 = S_BCNT1_I32_B64 [[S_MOV_B64_]], implicit-def $scc
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[S_BCNT1_I32_B64_]], 1, implicit-def $scc
+ ; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sgpr_32 = S_MUL_I32 [[S_LOAD_DWORD_IMM]], [[S_AND_B32_]]
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY1]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1(p4), 44, 0
+ %7:sgpr_32 = WAVE_REDUCE_XOR_PSEUDO_B32 killed %6, 1, implicit $exec
+ %8:vgpr_32 = COPY %7
+ GLOBAL_STORE_DWORD_SADDR killed %4, killed %8, killed %5, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+---
+name: divergent_value
+machineFunctionInfo:
+ isEntryFunction: true
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: divergent_value
+ ; GCN: bb.0.entry:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; GCN-NEXT: S_BRANCH %bb.2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
+ ; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
+ ; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sgpr_32 = S_XOR_B32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
+ ; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
+ ; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_XOR_B32_]]
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.3
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr0_sgpr1
+ %1:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %4:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM %1(p4), 36, 0
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %6:sgpr_32 = WAVE_REDUCE_XOR_PSEUDO_B32 %0, 1, implicit $exec
+ %7:vgpr_32 = COPY %6
+ GLOBAL_STORE_DWORD_SADDR killed %5, killed %7, killed %4, 0, 0, implicit $exec
+ bb.1:
+ %8:vgpr_32 = PHI %0, %bb.0
+ S_ENDPGM 0
+
+...
>From 792a0c3043742404ff5f9a033517dbd872f76d8a Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Wed, 12 Feb 2025 17:21:29 +0530
Subject: [PATCH 2/7] Renamed function to getIdentityValueForWaveReduction
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6ade924794368..e7f95e22f4002 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5038,7 +5038,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
return LoopBB;
}
-static uint32_t getInitialValueForWaveReduction(unsigned Opc) {
+static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
switch (Opc) {
case AMDGPU::S_MIN_U32:
return std::numeric_limits<uint32_t>::max();
@@ -5056,7 +5056,7 @@ static uint32_t getInitialValueForWaveReduction(unsigned Opc) {
case AMDGPU::S_AND_B32:
return std::numeric_limits<uint32_t>::max();
default:
- llvm_unreachable("Unexpected opcode in getInitialValueForWaveReduction");
+ llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
}
}
@@ -5184,7 +5184,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
// Create initail values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlock
- uint32_t InitalValue = getInitialValueForWaveReduction(Opc);
+ uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
auto TmpSReg =
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
>From 8892ab560e0529fe89416507caa38d0f0d0d71c0 Mon Sep 17 00:00:00 2001
From: Aaditya <115080342+easyonaadit at users.noreply.github.com>
Date: Tue, 17 Jun 2025 11:49:06 +0530
Subject: [PATCH 3/7] Update llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e7f95e22f4002..ddb0685df7403 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5097,7 +5097,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
unsigned CountReg =
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
>From 810b942d7fa8dd92da9ac3097d0def787a8aa3ca Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Tue, 17 Jun 2025 14:29:18 +0530
Subject: [PATCH 4/7] Address Review Comments
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 14 ++++++++------
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
2 files changed, 9 insertions(+), 7 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index be2f25d034dee..62852d2d64822 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2342,14 +2342,16 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
-multiclass AMDGPUWaveReduceOps<list<string> Operations> {
- foreach Op = Operations in { def Op : AMDGPUWaveReduce; }
+multiclass AMDGPUWaveReduceOps {
+ foreach Op = [
+ "umin", "min", "umax", "max", "uadd", "add", "usub", "sub", "and", "or",
+ "xor"
+ ] in {
+ def Op : AMDGPUWaveReduce;
+ }
}
-defvar Operations = [
- "umin", "min", "umax", "max", "uadd", "add", "usub", "sub", "and", "or", "xor"
-];
-defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceOps<Operations>;
+defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceOps;
def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ddb0685df7403..3056b9d4a3c7b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5182,7 +5182,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- // Create initail values of induction variable from Exec, Accumulator and
+ // Create initial values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlock
uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
auto TmpSReg =
>From 869e4cea1817025858d7aa47367e7de5cd345502 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Wed, 18 Jun 2025 11:43:39 +0530
Subject: [PATCH 5/7] Remove Redundant Ops
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 +-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 -
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 16 +-
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.uadd.ll | 1240 ----------------
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.usub.ll | 1286 -----------------
6 files changed, 10 insertions(+), 2546 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.uadd.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.usub.ll
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 62852d2d64822..08a5001ad5aa2 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2343,10 +2343,8 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
multiclass AMDGPUWaveReduceOps {
- foreach Op = [
- "umin", "min", "umax", "max", "uadd", "add", "usub", "sub", "and", "or",
- "xor"
- ] in {
+ foreach Op =
+ ["umin", "min", "umax", "max", "add", "sub", "and", "or", "xor"] in {
def Op : AMDGPUWaveReduce;
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5084d2106f232..5591e40c198e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5007,9 +5007,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case Intrinsic::amdgcn_wave_reduce_add:
- case Intrinsic::amdgcn_wave_reduce_uadd:
case Intrinsic::amdgcn_wave_reduce_sub:
- case Intrinsic::amdgcn_wave_reduce_usub:
case Intrinsic::amdgcn_wave_reduce_min:
case Intrinsic::amdgcn_wave_reduce_umin:
case Intrinsic::amdgcn_wave_reduce_max:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3056b9d4a3c7b..e4d362d29885b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5097,7 +5097,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
unsigned CountReg =
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
@@ -5262,12 +5262,8 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
- case AMDGPU::WAVE_REDUCE_UADD_PSEUDO_U32:
- return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
- case AMDGPU::WAVE_REDUCE_USUB_PSEUDO_U32:
- return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c10250852ea7b..f9b19e2297585 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -306,9 +306,9 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
// clang-format off
defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
multiclass
- AMDGPUWaveReducePseudoGenerator<string Op, string DataType, string Size> {
+ AMDGPUWaveReducePseudoGenerator<string Op, string DataType> {
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
- def !toupper(Op) #"_PSEUDO_" #DataType #Size
+ def !toupper(Op) #"_PSEUDO_" #DataType
: VPseudoInstSI<(outs SGPR_32 : $sdst),
(ins VSrc_b32 : $src, VSrc_b32 : $strategy),
[(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
@@ -317,17 +317,15 @@ multiclass
// clang-format on
// Input list : [Operation_name,
-// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
-// Size_in_bits]
+// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B)]
defvar Operations = [
- ["umin", "U", "32"], ["min", "I", "32"], ["umax", "U", "32"],
- ["max", "I", "32"], ["uadd", "U", "32"], ["add", "I", "32"],
- ["usub", "U", "32"], ["sub", "I", "32"], ["and", "B", "32"],
- ["or", "B", "32"], ["xor", "B", "32"]
+ ["umin", "U32"], ["min", "I32"], ["umax", "U32"], ["max", "I32"],
+ ["add", "I32"], ["sub", "I32"], ["and", "B32"], ["or", "B32"],
+ ["xor", "B32"]
];
foreach Op = Operations in {
- defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1], Op[2]>;
+ defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1]>;
}
let usesCustomInserter = 1, Defs = [VCC] in {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.uadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.uadd.ll
deleted file mode 100644
index d7b0ebb709a77..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.uadd.ll
+++ /dev/null
@@ -1,1240 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
-
-declare i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32, i32 immarg)
-declare i32 @llvm.amdgcn.workitem.id.x()
-
-define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
-; GFX8DAGISEL-LABEL: uniform_value:
-; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8DAGISEL-NEXT: s_endpgm
-;
-; GFX8GISEL-LABEL: uniform_value:
-; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GISEL-NEXT: s_endpgm
-;
-; GFX9DAGISEL-LABEL: uniform_value:
-; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9DAGISEL-NEXT: s_endpgm
-;
-; GFX9GISEL-LABEL: uniform_value:
-; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9GISEL-NEXT: s_endpgm
-;
-; GFX1064DAGISEL-LABEL: uniform_value:
-; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: s_clause 0x1
-; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1064DAGISEL-NEXT: s_endpgm
-;
-; GFX1064GISEL-LABEL: uniform_value:
-; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: s_clause 0x1
-; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1064GISEL-NEXT: s_endpgm
-;
-; GFX1032DAGISEL-LABEL: uniform_value:
-; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: s_clause 0x1
-; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1032DAGISEL-NEXT: s_endpgm
-;
-; GFX1032GISEL-LABEL: uniform_value:
-; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: s_clause 0x1
-; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1032GISEL-NEXT: s_endpgm
-;
-; GFX1164DAGISEL-LABEL: uniform_value:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_clause 0x1
-; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT: s_endpgm
-;
-; GFX1164GISEL-LABEL: uniform_value:
-; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_clause 0x1
-; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_endpgm
-;
-; GFX1132DAGISEL-LABEL: uniform_value:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_clause 0x1
-; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT: s_endpgm
-;
-; GFX1132GISEL-LABEL: uniform_value:
-; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_clause 0x1
-; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT: s_endpgm
-entry:
- %result = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 %in, i32 1)
- store i32 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
-; GFX8DAGISEL-LABEL: const_value:
-; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8DAGISEL-NEXT: s_endpgm
-;
-; GFX8GISEL-LABEL: const_value:
-; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8GISEL-NEXT: s_mulk_i32 s2, 0x7b
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GISEL-NEXT: s_endpgm
-;
-; GFX9DAGISEL-LABEL: const_value:
-; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9DAGISEL-NEXT: s_endpgm
-;
-; GFX9GISEL-LABEL: const_value:
-; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9GISEL-NEXT: s_mulk_i32 s2, 0x7b
-; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9GISEL-NEXT: s_endpgm
-;
-; GFX1064DAGISEL-LABEL: const_value:
-; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1064DAGISEL-NEXT: s_endpgm
-;
-; GFX1064GISEL-LABEL: const_value:
-; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064GISEL-NEXT: s_mulk_i32 s2, 0x7b
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1064GISEL-NEXT: s_endpgm
-;
-; GFX1032DAGISEL-LABEL: const_value:
-; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1032DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1032DAGISEL-NEXT: s_endpgm
-;
-; GFX1032GISEL-LABEL: const_value:
-; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1032GISEL-NEXT: s_mulk_i32 s2, 0x7b
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1032GISEL-NEXT: s_endpgm
-;
-; GFX1164DAGISEL-LABEL: const_value:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT: s_endpgm
-;
-; GFX1164GISEL-LABEL: const_value:
-; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164GISEL-NEXT: s_mulk_i32 s2, 0x7b
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_endpgm
-;
-; GFX1132DAGISEL-LABEL: const_value:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1132DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT: s_endpgm
-;
-; GFX1132GISEL-LABEL: const_value:
-; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132GISEL-NEXT: s_mulk_i32 s2, 0x7b
-; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT: s_endpgm
- entry:
- %result = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 123, i32 1)
- store i32 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
-; GFX8DAGISEL-LABEL: poison_value:
-; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: s_mul_i32 s2, s0, s2
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8DAGISEL-NEXT: s_endpgm
-;
-; GFX8GISEL-LABEL: poison_value:
-; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: s_mul_i32 s2, s0, s2
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GISEL-NEXT: s_endpgm
-;
-; GFX9DAGISEL-LABEL: poison_value:
-; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: s_mul_i32 s2, s0, s2
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9DAGISEL-NEXT: s_endpgm
-;
-; GFX9GISEL-LABEL: poison_value:
-; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: s_mul_i32 s2, s0, s2
-; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9GISEL-NEXT: s_endpgm
-;
-; GFX1064DAGISEL-LABEL: poison_value:
-; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s0, s2
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1064DAGISEL-NEXT: s_endpgm
-;
-; GFX1064GISEL-LABEL: poison_value:
-; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: s_mul_i32 s2, s0, s2
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1064GISEL-NEXT: s_endpgm
-;
-; GFX1032DAGISEL-LABEL: poison_value:
-; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s0, s2
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1032DAGISEL-NEXT: s_endpgm
-;
-; GFX1032GISEL-LABEL: poison_value:
-; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: s_mul_i32 s2, s0, s2
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1032GISEL-NEXT: s_endpgm
-;
-; GFX1164DAGISEL-LABEL: poison_value:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s0, s2
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT: s_endpgm
-;
-; GFX1164GISEL-LABEL: poison_value:
-; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: s_mul_i32 s2, s0, s2
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_endpgm
-;
-; GFX1132DAGISEL-LABEL: poison_value:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s0, s2
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT: s_endpgm
-;
-; GFX1132GISEL-LABEL: poison_value:
-; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: s_mul_i32 s2, s0, s2
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT: s_endpgm
-entry:
- %result = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 poison, i32 1)
- store i32 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
-; GFX8DAGISEL-LABEL: divergent_value:
-; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
-; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX8DAGISEL-NEXT: s_add_i32 s4, s4, s6
-; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX8DAGISEL-NEXT: ; %bb.2:
-; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8DAGISEL-NEXT: s_endpgm
-;
-; GFX8GISEL-LABEL: divergent_value:
-; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8GISEL-NEXT: s_mov_b32 s4, 0
-; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX8GISEL-NEXT: s_add_i32 s4, s4, s6
-; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX8GISEL-NEXT: ; %bb.2:
-; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GISEL-NEXT: s_endpgm
-;
-; GFX9DAGISEL-LABEL: divergent_value:
-; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
-; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX9DAGISEL-NEXT: s_add_i32 s4, s4, s6
-; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX9DAGISEL-NEXT: ; %bb.2:
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9DAGISEL-NEXT: s_endpgm
-;
-; GFX9GISEL-LABEL: divergent_value:
-; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9GISEL-NEXT: s_mov_b32 s4, 0
-; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX9GISEL-NEXT: s_add_i32 s4, s4, s6
-; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX9GISEL-NEXT: ; %bb.2:
-; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9GISEL-NEXT: s_endpgm
-;
-; GFX1064DAGISEL-LABEL: divergent_value:
-; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
-; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1064DAGISEL-NEXT: s_add_i32 s4, s4, s6
-; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1064DAGISEL-NEXT: ; %bb.2:
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1064DAGISEL-NEXT: s_endpgm
-;
-; GFX1064GISEL-LABEL: divergent_value:
-; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
-; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1064GISEL-NEXT: s_add_i32 s4, s4, s6
-; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1064GISEL-NEXT: ; %bb.2:
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1064GISEL-NEXT: s_endpgm
-;
-; GFX1032DAGISEL-LABEL: divergent_value:
-; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
-; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1032DAGISEL-NEXT: s_add_i32 s2, s2, s5
-; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1032DAGISEL-NEXT: ; %bb.2:
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1032DAGISEL-NEXT: s_endpgm
-;
-; GFX1032GISEL-LABEL: divergent_value:
-; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
-; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1032GISEL-NEXT: s_add_i32 s2, s2, s5
-; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1032GISEL-NEXT: ; %bb.2:
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1032GISEL-NEXT: s_endpgm
-;
-; GFX1164DAGISEL-LABEL: divergent_value:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
-; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: s_add_i32 s4, s4, s6
-; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1164DAGISEL-NEXT: ; %bb.2:
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164DAGISEL-NEXT: s_endpgm
-;
-; GFX1164GISEL-LABEL: divergent_value:
-; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
-; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164GISEL-NEXT: s_add_i32 s4, s4, s6
-; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1164GISEL-NEXT: ; %bb.2:
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_endpgm
-;
-; GFX1132DAGISEL-LABEL: divergent_value:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
-; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: s_add_i32 s2, s2, s5
-; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1132DAGISEL-NEXT: ; %bb.2:
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132DAGISEL-NEXT: s_endpgm
-;
-; GFX1132GISEL-LABEL: divergent_value:
-; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
-; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132GISEL-NEXT: s_add_i32 s2, s2, s5
-; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1132GISEL-NEXT: ; %bb.2:
-; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT: s_endpgm
-entry:
- %id.x = call i32 @llvm.amdgcn.workitem.id.x()
- %result = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 %id.x, i32 1)
- store i32 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
-; GFX8DAGISEL-LABEL: divergent_cfg:
-; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
-; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2
-; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
-; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
-; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
-; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
-; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
-; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
-; GFX8DAGISEL-NEXT: s_add_i32 s6, s6, s8
-; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX8DAGISEL-NEXT: ; %bb.5:
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
-; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
-; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
-; GFX8DAGISEL-NEXT: s_endpgm
-;
-; GFX8GISEL-LABEL: divergent_cfg:
-; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
-; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
-; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX8GISEL-NEXT: ; %bb.1: ; %else
-; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
-; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: s_mul_i32 s6, s6, s2
-; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
-; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
-; GFX8GISEL-NEXT: ; %bb.3: ; %if
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8GISEL-NEXT: s_mov_b32 s6, 0
-; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
-; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
-; GFX8GISEL-NEXT: s_add_i32 s6, s6, s8
-; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX8GISEL-NEXT: .LBB4_5: ; %endif
-; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
-; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GISEL-NEXT: s_endpgm
-;
-; GFX9DAGISEL-LABEL: divergent_cfg:
-; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
-; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2
-; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
-; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
-; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
-; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
-; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
-; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
-; GFX9DAGISEL-NEXT: s_add_i32 s6, s6, s8
-; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX9DAGISEL-NEXT: ; %bb.5:
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
-; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9DAGISEL-NEXT: s_endpgm
-;
-; GFX9GISEL-LABEL: divergent_cfg:
-; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
-; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
-; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX9GISEL-NEXT: ; %bb.1: ; %else
-; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
-; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s2
-; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
-; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
-; GFX9GISEL-NEXT: ; %bb.3: ; %if
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9GISEL-NEXT: s_mov_b32 s6, 0
-; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
-; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
-; GFX9GISEL-NEXT: s_add_i32 s6, s6, s8
-; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX9GISEL-NEXT: .LBB4_5: ; %endif
-; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
-; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9GISEL-NEXT: s_endpgm
-;
-; GFX1064DAGISEL-LABEL: divergent_cfg:
-; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
-; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2
-; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
-; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
-; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
-; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
-; GFX1064DAGISEL-NEXT: s_add_i32 s6, s6, s8
-; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX1064DAGISEL-NEXT: ; %bb.5:
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
-; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1064DAGISEL-NEXT: s_endpgm
-;
-; GFX1064GISEL-LABEL: divergent_cfg:
-; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
-; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
-; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1064GISEL-NEXT: ; %bb.1: ; %else
-; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
-; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s2
-; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
-; GFX1064GISEL-NEXT: ; %bb.3: ; %if
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
-; GFX1064GISEL-NEXT: s_add_i32 s6, s6, s8
-; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
-; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1064GISEL-NEXT: s_endpgm
-;
-; GFX1032DAGISEL-LABEL: divergent_cfg:
-; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
-; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
-; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
-; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
-; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
-; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
-; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
-; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
-; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
-; GFX1032DAGISEL-NEXT: s_add_i32 s1, s1, s6
-; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX1032DAGISEL-NEXT: ; %bb.5:
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
-; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1032DAGISEL-NEXT: s_endpgm
-;
-; GFX1032GISEL-LABEL: divergent_cfg:
-; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
-; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
-; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
-; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1032GISEL-NEXT: ; %bb.1: ; %else
-; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
-; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
-; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2
-; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
-; GFX1032GISEL-NEXT: ; %bb.3: ; %if
-; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
-; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
-; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
-; GFX1032GISEL-NEXT: s_add_i32 s0, s0, s6
-; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
-; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
-; GFX1032GISEL-NEXT: s_endpgm
-;
-; GFX1164DAGISEL-LABEL: divergent_cfg:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
-; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
-; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2
-; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
-; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
-; GFX1164DAGISEL-NEXT: s_add_i32 s6, s6, s8
-; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX1164DAGISEL-NEXT: ; %bb.5:
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
-; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT: s_endpgm
-;
-; GFX1164GISEL-LABEL: divergent_cfg:
-; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
-; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1164GISEL-NEXT: ; %bb.1: ; %else
-; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: s_mul_i32 s6, s6, s2
-; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
-; GFX1164GISEL-NEXT: ; %bb.3: ; %if
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
-; GFX1164GISEL-NEXT: s_add_i32 s6, s6, s8
-; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
-; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_endpgm
-;
-; GFX1132DAGISEL-LABEL: divergent_cfg:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
-; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
-; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
-; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
-; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
-; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
-; GFX1132DAGISEL-NEXT: s_add_i32 s1, s1, s6
-; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX1132DAGISEL-NEXT: ; %bb.5:
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
-; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT: s_endpgm
-;
-; GFX1132GISEL-LABEL: divergent_cfg:
-; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
-; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
-; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1132GISEL-NEXT: ; %bb.1: ; %else
-; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
-; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2
-; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
-; GFX1132GISEL-NEXT: ; %bb.3: ; %if
-; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
-; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
-; GFX1132GISEL-NEXT: s_add_i32 s0, s0, s6
-; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
-; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
-; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
-; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
-; GFX1132GISEL-NEXT: s_endpgm
- entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %d_cmp = icmp ult i32 %tid, 16
- br i1 %d_cmp, label %if, label %else
-
-if:
- %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 %tid, i32 1)
- br label %endif
-
-else:
- %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.uadd.i32(i32 %in, i32 1)
- br label %endif
-
-endif:
- %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
- store i32 %combine, ptr addrspace(1) %out
- ret void
-}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10DAGISEL: {{.*}}
-; GFX10GISEL: {{.*}}
-; GFX11DAGISEL: {{.*}}
-; GFX11GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.usub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.usub.ll
deleted file mode 100644
index daff91c48c814..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.usub.ll
+++ /dev/null
@@ -1,1286 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
-
-define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
-; GFX8DAGISEL-LABEL: uniform_value:
-; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: s_mul_i32 s3, s6, -1
-; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8DAGISEL-NEXT: s_endpgm
-;
-; GFX8GISEL-LABEL: uniform_value:
-; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: s_mul_i32 s3, s6, -1
-; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GISEL-NEXT: s_endpgm
-;
-; GFX9DAGISEL-LABEL: uniform_value:
-; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: s_mul_i32 s3, s6, -1
-; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9DAGISEL-NEXT: s_endpgm
-;
-; GFX9GISEL-LABEL: uniform_value:
-; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: s_mul_i32 s3, s6, -1
-; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9GISEL-NEXT: s_endpgm
-;
-; GFX1064DAGISEL-LABEL: uniform_value:
-; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: s_clause 0x1
-; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s6, -1
-; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1064DAGISEL-NEXT: s_endpgm
-;
-; GFX1064GISEL-LABEL: uniform_value:
-; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: s_clause 0x1
-; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: s_mul_i32 s3, s6, -1
-; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1064GISEL-NEXT: s_endpgm
-;
-; GFX1032DAGISEL-LABEL: uniform_value:
-; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: s_clause 0x1
-; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, -1
-; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1032DAGISEL-NEXT: s_endpgm
-;
-; GFX1032GISEL-LABEL: uniform_value:
-; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: s_clause 0x1
-; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, -1
-; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1032GISEL-NEXT: s_endpgm
-;
-; GFX1164DAGISEL-LABEL: uniform_value:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_clause 0x1
-; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s6, -1
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT: s_endpgm
-;
-; GFX1164GISEL-LABEL: uniform_value:
-; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_clause 0x1
-; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: s_mul_i32 s3, s6, -1
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_endpgm
-;
-; GFX1132DAGISEL-LABEL: uniform_value:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_clause 0x1
-; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, -1
-; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT: s_endpgm
-;
-; GFX1132GISEL-LABEL: uniform_value:
-; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_clause 0x1
-; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, -1
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3
-; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT: s_endpgm
-entry:
- %result = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 %in, i32 1)
- store i32 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
-; GFX8DAGISEL-LABEL: const_value:
-; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8DAGISEL-NEXT: s_mul_i32 s4, -1, 0x7b
-; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2
-; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8DAGISEL-NEXT: s_endpgm
-;
-; GFX8GISEL-LABEL: const_value:
-; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
-; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GISEL-NEXT: s_endpgm
-;
-; GFX9DAGISEL-LABEL: const_value:
-; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
-; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9DAGISEL-NEXT: s_endpgm
-;
-; GFX9GISEL-LABEL: const_value:
-; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
-; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9GISEL-NEXT: s_endpgm
-;
-; GFX1064DAGISEL-LABEL: const_value:
-; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
-; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1064DAGISEL-NEXT: s_endpgm
-;
-; GFX1064GISEL-LABEL: const_value:
-; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
-; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1064GISEL-NEXT: s_endpgm
-;
-; GFX1032DAGISEL-LABEL: const_value:
-; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
-; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1032DAGISEL-NEXT: s_endpgm
-;
-; GFX1032GISEL-LABEL: const_value:
-; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
-; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1032GISEL-NEXT: s_endpgm
-;
-; GFX1164DAGISEL-LABEL: const_value:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT: s_endpgm
-;
-; GFX1164GISEL-LABEL: const_value:
-; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_endpgm
-;
-; GFX1132DAGISEL-LABEL: const_value:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: s_mul_i32 s3, -1, 0x7b
-; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT: s_endpgm
-;
-; GFX1132GISEL-LABEL: const_value:
-; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: s_mul_i32 s3, -1, 0x7b
-; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT: s_endpgm
- entry:
- %result = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 123, i32 1)
- store i32 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
-; GFX8DAGISEL-LABEL: poison_value:
-; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: s_mul_i32 s4, s0, -1
-; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8DAGISEL-NEXT: s_endpgm
-;
-; GFX8GISEL-LABEL: poison_value:
-; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: s_mul_i32 s3, s0, -1
-; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GISEL-NEXT: s_endpgm
-;
-; GFX9DAGISEL-LABEL: poison_value:
-; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: s_mul_i32 s3, s0, -1
-; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9DAGISEL-NEXT: s_endpgm
-;
-; GFX9GISEL-LABEL: poison_value:
-; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: s_mul_i32 s3, s0, -1
-; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9GISEL-NEXT: s_endpgm
-;
-; GFX1064DAGISEL-LABEL: poison_value:
-; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s0, -1
-; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1064DAGISEL-NEXT: s_endpgm
-;
-; GFX1064GISEL-LABEL: poison_value:
-; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: s_mul_i32 s3, s0, -1
-; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1064GISEL-NEXT: s_endpgm
-;
-; GFX1032DAGISEL-LABEL: poison_value:
-; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s0, -1
-; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1032DAGISEL-NEXT: s_endpgm
-;
-; GFX1032GISEL-LABEL: poison_value:
-; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: s_mul_i32 s3, s0, -1
-; GFX1032GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1032GISEL-NEXT: s_endpgm
-;
-; GFX1164DAGISEL-LABEL: poison_value:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s0, -1
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT: s_endpgm
-;
-; GFX1164GISEL-LABEL: poison_value:
-; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: s_mul_i32 s3, s0, -1
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_endpgm
-;
-; GFX1132DAGISEL-LABEL: poison_value:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s0, -1
-; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT: s_endpgm
-;
-; GFX1132GISEL-LABEL: poison_value:
-; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: s_mul_i32 s3, s0, -1
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT: s_endpgm
-entry:
- %result = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 poison, i32 1)
- store i32 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
-; GFX8DAGISEL-LABEL: divergent_value:
-; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
-; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX8DAGISEL-NEXT: s_sub_i32 s4, s4, s6
-; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX8DAGISEL-NEXT: ; %bb.2:
-; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8DAGISEL-NEXT: s_endpgm
-;
-; GFX8GISEL-LABEL: divergent_value:
-; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8GISEL-NEXT: s_mov_b32 s4, 0
-; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX8GISEL-NEXT: s_sub_i32 s4, s4, s6
-; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX8GISEL-NEXT: ; %bb.2:
-; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GISEL-NEXT: s_endpgm
-;
-; GFX9DAGISEL-LABEL: divergent_value:
-; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
-; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX9DAGISEL-NEXT: s_sub_i32 s4, s4, s6
-; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX9DAGISEL-NEXT: ; %bb.2:
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9DAGISEL-NEXT: s_endpgm
-;
-; GFX9GISEL-LABEL: divergent_value:
-; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9GISEL-NEXT: s_mov_b32 s4, 0
-; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX9GISEL-NEXT: s_sub_i32 s4, s4, s6
-; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX9GISEL-NEXT: ; %bb.2:
-; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9GISEL-NEXT: s_endpgm
-;
-; GFX1064DAGISEL-LABEL: divergent_value:
-; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
-; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1064DAGISEL-NEXT: s_sub_i32 s4, s4, s6
-; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1064DAGISEL-NEXT: ; %bb.2:
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1064DAGISEL-NEXT: s_endpgm
-;
-; GFX1064GISEL-LABEL: divergent_value:
-; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
-; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
-; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1064GISEL-NEXT: s_sub_i32 s4, s4, s6
-; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1064GISEL-NEXT: ; %bb.2:
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1064GISEL-NEXT: s_endpgm
-;
-; GFX1032DAGISEL-LABEL: divergent_value:
-; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
-; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1032DAGISEL-NEXT: s_sub_i32 s2, s2, s5
-; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1032DAGISEL-NEXT: ; %bb.2:
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1032DAGISEL-NEXT: s_endpgm
-;
-; GFX1032GISEL-LABEL: divergent_value:
-; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
-; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1032GISEL-NEXT: s_sub_i32 s2, s2, s5
-; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1032GISEL-NEXT: ; %bb.2:
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1032GISEL-NEXT: s_endpgm
-;
-; GFX1164DAGISEL-LABEL: divergent_value:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
-; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: s_sub_i32 s4, s4, s6
-; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1164DAGISEL-NEXT: ; %bb.2:
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164DAGISEL-NEXT: s_endpgm
-;
-; GFX1164GISEL-LABEL: divergent_value:
-; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
-; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164GISEL-NEXT: s_sub_i32 s4, s4, s6
-; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1164GISEL-NEXT: ; %bb.2:
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_endpgm
-;
-; GFX1132DAGISEL-LABEL: divergent_value:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
-; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: s_sub_i32 s2, s2, s5
-; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1132DAGISEL-NEXT: ; %bb.2:
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132DAGISEL-NEXT: s_endpgm
-;
-; GFX1132GISEL-LABEL: divergent_value:
-; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
-; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132GISEL-NEXT: s_sub_i32 s2, s2, s5
-; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
-; GFX1132GISEL-NEXT: ; %bb.2:
-; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT: s_endpgm
-entry:
- %id.x = call i32 @llvm.amdgcn.workitem.id.x()
- %result = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 %id.x, i32 1)
- store i32 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
-; GFX8DAGISEL-LABEL: divergent_cfg:
-; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
-; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2
-; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
-; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
-; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: s_mul_i32 s3, s6, -1
-; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
-; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
-; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
-; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
-; GFX8DAGISEL-NEXT: s_sub_i32 s6, s6, s8
-; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX8DAGISEL-NEXT: ; %bb.5:
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
-; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
-; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
-; GFX8DAGISEL-NEXT: s_endpgm
-;
-; GFX8GISEL-LABEL: divergent_cfg:
-; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
-; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
-; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX8GISEL-NEXT: ; %bb.1: ; %else
-; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
-; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: s_mul_i32 s3, s6, -1
-; GFX8GISEL-NEXT: s_mul_i32 s6, s3, s2
-; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
-; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
-; GFX8GISEL-NEXT: ; %bb.3: ; %if
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX8GISEL-NEXT: s_mov_b32 s6, 0
-; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
-; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
-; GFX8GISEL-NEXT: s_sub_i32 s6, s6, s8
-; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX8GISEL-NEXT: .LBB4_5: ; %endif
-; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
-; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GISEL-NEXT: s_endpgm
-;
-; GFX9DAGISEL-LABEL: divergent_cfg:
-; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
-; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2
-; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
-; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
-; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: s_mul_i32 s3, s6, -1
-; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
-; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
-; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
-; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
-; GFX9DAGISEL-NEXT: s_sub_i32 s6, s6, s8
-; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX9DAGISEL-NEXT: ; %bb.5:
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
-; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9DAGISEL-NEXT: s_endpgm
-;
-; GFX9GISEL-LABEL: divergent_cfg:
-; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
-; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
-; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX9GISEL-NEXT: ; %bb.1: ; %else
-; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
-; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: s_mul_i32 s3, s6, -1
-; GFX9GISEL-NEXT: s_mul_i32 s6, s3, s2
-; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
-; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
-; GFX9GISEL-NEXT: ; %bb.3: ; %if
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX9GISEL-NEXT: s_mov_b32 s6, 0
-; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
-; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
-; GFX9GISEL-NEXT: s_sub_i32 s6, s6, s8
-; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX9GISEL-NEXT: .LBB4_5: ; %endif
-; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
-; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9GISEL-NEXT: s_endpgm
-;
-; GFX1064DAGISEL-LABEL: divergent_cfg:
-; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
-; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2
-; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
-; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
-; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s6, -1
-; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
-; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
-; GFX1064DAGISEL-NEXT: s_sub_i32 s6, s6, s8
-; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX1064DAGISEL-NEXT: ; %bb.5:
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
-; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1064DAGISEL-NEXT: s_endpgm
-;
-; GFX1064GISEL-LABEL: divergent_cfg:
-; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
-; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
-; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1064GISEL-NEXT: ; %bb.1: ; %else
-; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
-; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: s_mul_i32 s3, s6, -1
-; GFX1064GISEL-NEXT: s_mul_i32 s6, s3, s2
-; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
-; GFX1064GISEL-NEXT: ; %bb.3: ; %if
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
-; GFX1064GISEL-NEXT: s_sub_i32 s6, s6, s8
-; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
-; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1064GISEL-NEXT: s_endpgm
-;
-; GFX1032DAGISEL-LABEL: divergent_cfg:
-; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
-; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
-; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
-; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
-; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
-; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, -1
-; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
-; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
-; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
-; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
-; GFX1032DAGISEL-NEXT: s_sub_i32 s1, s1, s6
-; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX1032DAGISEL-NEXT: ; %bb.5:
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
-; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1032DAGISEL-NEXT: s_endpgm
-;
-; GFX1032GISEL-LABEL: divergent_cfg:
-; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
-; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
-; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
-; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1032GISEL-NEXT: ; %bb.1: ; %else
-; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
-; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
-; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, -1
-; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2
-; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
-; GFX1032GISEL-NEXT: ; %bb.3: ; %if
-; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
-; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
-; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
-; GFX1032GISEL-NEXT: s_sub_i32 s0, s0, s6
-; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
-; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
-; GFX1032GISEL-NEXT: s_endpgm
-;
-; GFX1164DAGISEL-LABEL: divergent_cfg:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
-; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
-; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s6, -1
-; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
-; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
-; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
-; GFX1164DAGISEL-NEXT: s_sub_i32 s6, s6, s8
-; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX1164DAGISEL-NEXT: ; %bb.5:
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
-; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT: s_endpgm
-;
-; GFX1164GISEL-LABEL: divergent_cfg:
-; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
-; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1164GISEL-NEXT: ; %bb.1: ; %else
-; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: s_mul_i32 s3, s6, -1
-; GFX1164GISEL-NEXT: s_mul_i32 s6, s3, s2
-; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
-; GFX1164GISEL-NEXT: ; %bb.3: ; %if
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
-; GFX1164GISEL-NEXT: s_sub_i32 s6, s6, s8
-; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
-; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_endpgm
-;
-; GFX1132DAGISEL-LABEL: divergent_cfg:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
-; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
-; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
-; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, -1
-; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2
-; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
-; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
-; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
-; GFX1132DAGISEL-NEXT: s_sub_i32 s1, s1, s6
-; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX1132DAGISEL-NEXT: ; %bb.5:
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
-; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT: s_endpgm
-;
-; GFX1132GISEL-LABEL: divergent_cfg:
-; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
-; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
-; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1132GISEL-NEXT: ; %bb.1: ; %else
-; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
-; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
-; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, -1
-; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2
-; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
-; GFX1132GISEL-NEXT: ; %bb.3: ; %if
-; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
-; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
-; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
-; GFX1132GISEL-NEXT: s_sub_i32 s0, s0, s6
-; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
-; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
-; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
-; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
-; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
-; GFX1132GISEL-NEXT: s_endpgm
- entry:
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %d_cmp = icmp ult i32 %tid, 16
- br i1 %d_cmp, label %if, label %else
-
-if:
- %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 %tid, i32 1)
- br label %endif
-
-else:
- %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.usub.i32(i32 %in, i32 1)
- br label %endif
-
-endif:
- %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
- store i32 %combine, ptr addrspace(1) %out
- ret void
-}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10DAGISEL: {{.*}}
-; GFX10GISEL: {{.*}}
-; GFX11DAGISEL: {{.*}}
-; GFX11GISEL: {{.*}}
>From 44e2454a39508177ab8dc1d4d55ad8e43755dd19 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Wed, 18 Jun 2025 15:11:29 +0530
Subject: [PATCH 6/7] Rebase against main
---
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll | 12 ++++--------
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll | 12 ++++--------
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll | 12 ++++--------
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll | 12 ++++--------
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll | 12 ++++--------
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll | 12 ++++--------
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll | 12 ++++--------
.../CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir | 2 +-
.../CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir | 2 +-
.../CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir | 2 +-
.../CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir | 2 +-
.../CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir | 2 +-
.../CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir | 2 +-
.../CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir | 2 +-
14 files changed, 35 insertions(+), 63 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
index e93e8d4108ba0..4d2b18134c899 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
@@ -665,7 +665,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: s_add_i32 s4, s4, s6
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -686,7 +685,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: s_add_i32 s4, s4, s6
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -708,7 +706,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: s_add_i32 s2, s2, s5
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -729,7 +726,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: s_add_i32 s2, s2, s5
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1075,7 +1071,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_add_i32 s6, s6, s8
@@ -1116,7 +1112,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_add_i32 s6, s6, s8
@@ -1158,7 +1154,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_add_i32 s1, s1, s6
@@ -1199,7 +1195,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_add_i32 s0, s0, s6
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
index ac11fa6a5a7a5..a06dd92b98892 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
@@ -459,7 +459,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: s_and_b32 s4, s4, s6
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -480,7 +479,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: s_and_b32 s4, s4, s6
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -502,7 +500,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, s5
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -523,7 +520,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: s_and_b32 s2, s2, s5
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -840,7 +836,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_mov_b32 s6, -1
; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_and_b32 s6, s6, s8
@@ -878,7 +874,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_mov_b32 s6, -1
; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_and_b32 s6, s6, s8
@@ -915,7 +911,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, -1
; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_and_b32 s1, s1, s6
@@ -953,7 +949,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_mov_b32 s0, -1
; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_and_b32 s0, s0, s6
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
index 2df2886d068c8..a29c320f24a1f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
@@ -459,7 +459,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: s_max_i32 s4, s4, s6
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -480,7 +479,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: s_max_i32 s4, s4, s6
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -502,7 +500,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: s_max_i32 s2, s2, s5
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -523,7 +520,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: s_max_i32 s2, s2, s5
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -840,7 +836,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_brev_b32 s6, 1
; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_max_i32 s6, s6, s8
@@ -878,7 +874,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_brev_b32 s6, 1
; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_max_i32 s6, s6, s8
@@ -915,7 +911,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_brev_b32 s1, 1
; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_max_i32 s1, s1, s6
@@ -953,7 +949,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_brev_b32 s0, 1
; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_max_i32 s0, s0, s6
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
index 45335c7234956..2f81fbd3b9788 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll
@@ -459,7 +459,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: s_min_i32 s4, s4, s6
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -480,7 +479,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: s_min_i32 s4, s4, s6
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -502,7 +500,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: s_min_i32 s2, s2, s5
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -523,7 +520,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: s_min_i32 s2, s2, s5
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -840,7 +836,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_brev_b32 s6, -2
; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_min_i32 s6, s6, s8
@@ -878,7 +874,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_brev_b32 s6, -2
; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_min_i32 s6, s6, s8
@@ -915,7 +911,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_brev_b32 s1, -2
; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_min_i32 s1, s1, s6
@@ -953,7 +949,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_brev_b32 s0, -2
; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_min_i32 s0, s0, s6
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
index 590c5cc383d43..1b817da5a3017 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
@@ -459,7 +459,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: s_or_b32 s4, s4, s6
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -480,7 +479,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: s_or_b32 s4, s4, s6
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -502,7 +500,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: s_or_b32 s2, s2, s5
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -523,7 +520,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: s_or_b32 s2, s2, s5
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -840,7 +836,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_or_b32 s6, s6, s8
@@ -878,7 +874,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_or_b32 s6, s6, s8
@@ -915,7 +911,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_or_b32 s1, s1, s6
@@ -953,7 +949,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_or_b32 s0, s0, s6
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
index 1377dc51c4b34..2fb291ccd8ce5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
@@ -702,7 +702,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: s_sub_i32 s4, s4, s6
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -723,7 +722,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: s_sub_i32 s4, s4, s6
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -745,7 +743,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: s_sub_i32 s2, s2, s5
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -766,7 +763,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: s_sub_i32 s2, s2, s5
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1121,7 +1117,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_sub_i32 s6, s6, s8
@@ -1163,7 +1159,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_sub_i32 s6, s6, s8
@@ -1206,7 +1202,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_sub_i32 s1, s1, s6
@@ -1248,7 +1244,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_sub_i32 s0, s0, s6
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
index 6194aea667953..5e74de1e6956c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
@@ -706,7 +706,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: s_xor_b32 s4, s4, s6
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -727,7 +726,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: s_xor_b32 s4, s4, s6
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -749,7 +747,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: s_xor_b32 s2, s2, s5
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -770,7 +767,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: s_xor_b32 s2, s2, s5
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1125,7 +1121,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_xor_b32 s6, s6, s8
@@ -1167,7 +1163,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_xor_b32 s6, s6, s8
@@ -1210,7 +1206,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_xor_b32 s1, s1, s6
@@ -1252,7 +1248,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_xor_b32 s0, s0, s6
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir
index ed9005010a71d..e471226622389 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir
@@ -59,7 +59,7 @@ body: |
; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
- ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
; GCN-NEXT: [[S_ADD_I32_:%[0-9]+]]:sgpr_32 = S_ADD_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir
index 8ab1905295a8d..df4db4cc57658 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir
@@ -58,7 +58,7 @@ body: |
; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
- ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir
index a5f8b0dbe52b2..2f32bdce743fe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir
@@ -58,7 +58,7 @@ body: |
; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
- ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
; GCN-NEXT: [[S_MAX_I32_:%[0-9]+]]:sgpr_32 = S_MAX_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir
index 7a0a522d1d3dd..6c154403bbd9f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir
@@ -58,7 +58,7 @@ body: |
; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
- ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
; GCN-NEXT: [[S_MIN_I32_:%[0-9]+]]:sgpr_32 = S_MIN_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir
index 4711240af3ecf..32d5a91d17e43 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir
@@ -58,7 +58,7 @@ body: |
; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
- ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
; GCN-NEXT: [[S_OR_B32_:%[0-9]+]]:sgpr_32 = S_OR_B32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir
index 7b729baaaf21f..d6ed8463ce1ff 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir
@@ -61,7 +61,7 @@ body: |
; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
- ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
; GCN-NEXT: [[S_SUB_I32_:%[0-9]+]]:sgpr_32 = S_SUB_I32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir
index 3dbf0c063af92..2ee21097b67f8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir
@@ -61,7 +61,7 @@ body: |
; GCN-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.2
; GCN-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.0, %11, %bb.2
; GCN-NEXT: [[S_FF1_I32_B64_:%[0-9]+]]:sgpr_32 = S_FF1_I32_B64 [[PHI1]]
- ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sgpr_32 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
+ ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[COPY1]], [[S_FF1_I32_B64_]]
; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sgpr_32 = S_XOR_B32 [[PHI]], [[V_READLANE_B32_]], implicit-def $scc
; GCN-NEXT: [[S_BITSET0_B64_:%[0-9]+]]:sreg_64_xexec = S_BITSET0_B64 [[S_FF1_I32_B64_]], [[PHI1]]
; GCN-NEXT: S_CMP_LG_U64 [[S_BITSET0_B64_]], 0, implicit-def $scc
>From 0b4673a00b57bd43e07f754a4f0420b449cbf0dc Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Wed, 18 Jun 2025 15:20:01 +0530
Subject: [PATCH 7/7] Address Review Comments
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e4d362d29885b..b81af82096a6d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5097,7 +5097,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
unsigned CountReg =
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
More information about the llvm-commits
mailing list