[llvm-branch-commits] [llvm] [AMDGPU] Extending wave reduction intrinsics for `i64` types - 2 (PR #151309)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jul 30 03:46:57 PDT 2025
https://github.com/easyonaadit created https://github.com/llvm/llvm-project/pull/151309
Supporting Arithemtic Operations: `add`, `sub`
>From fb0a84b999ea6e7bec64b902f4a06befaa2b7648 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Sat, 19 Jul 2025 12:48:18 +0530
Subject: [PATCH] [AMDGPU] Extending wave reduction intrinsics for `i64` types
- 2
Supporting Arithemtic Operations: `add`, `sub`
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 155 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 2 +
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll | 2589 +++++++++++++++
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll | 2897 +++++++++++++++++
4 files changed, 5618 insertions(+), 25 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0f529ef362199..56d8e739b6493 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5107,7 +5107,9 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
case AMDGPU::V_CMP_GT_I64_e64: // max.i64
return std::numeric_limits<int32_t>::min();
case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_I32:
+ case AMDGPU::S_SUB_U64_PSEUDO:
case AMDGPU::S_OR_B32:
case AMDGPU::S_XOR_B32:
return std::numeric_limits<uint32_t>::min();
@@ -5153,11 +5155,14 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
}
case AMDGPU::S_XOR_B32:
case AMDGPU::S_ADD_I32:
- case AMDGPU::S_SUB_I32: {
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_I32:
+ case AMDGPU::S_SUB_U64_PSEUDO: {
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
- Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+ Register ActiveLanes =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
@@ -5165,39 +5170,39 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
unsigned CountReg =
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
- auto Exec =
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
- auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
- .addReg(Exec->getOperand(0).getReg());
+ auto NewAccumulator =
+ BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+ .addReg(ExecMask);
+
+ switch (Opc) {
+ case AMDGPU::S_XOR_B32: {
+ // Performing an XOR operation on a uniform value
+ // depends on the parity of the number of active lanes.
+ // For even parity, the result will be 0, for odd
+ // parity the result will be the same as the input value.
+ Register ParityRegister =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- switch (Opc) {
- case AMDGPU::S_XOR_B32: {
- // Performing an XOR operation on a uniform value
- // depends on the parity of the number of active lanes.
- // For even parity, the result will be 0, for odd
- // parity the result will be the same as the input value.
- Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
-
- auto ParityReg =
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
.addReg(NewAccumulator->getOperand(0).getReg())
- .addImm(1);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
- .addReg(SrcReg)
- .addReg(ParityReg->getOperand(0).getReg());
- break;
- }
+ .addImm(1)
+ .setOperandDead(3); // Dead scc
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ .addReg(SrcReg)
+ .addReg(ParityRegister);
+ break;
+ }
case AMDGPU::S_SUB_I32: {
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
// Take the negation of the source operand.
- auto InvertedValReg =
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
- .addImm(-1)
- .addReg(SrcReg);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
+ .addImm(0)
+ .addReg(SrcReg);
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
- .addReg(InvertedValReg->getOperand(0).getReg())
+ .addReg(NegatedVal)
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
@@ -5207,6 +5212,74 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO: {
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register Op1H_Op0L_Reg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register Op1L_Op0H_Reg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register NegatedValLo =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register NegatedValHi =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
+ const TargetRegisterClass *Src1SubRC =
+ TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
+
+ MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
+ MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
+
+ if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
+ .addImm(0)
+ .addReg(NewAccumulator->getOperand(0).getReg());
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
+ .addReg(NegatedValLo)
+ .addImm(31)
+ .setOperandDead(3); // Dead scc
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
+ .add(Op1L)
+ .addReg(NegatedValHi);
+ }
+ Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
+ ? NegatedValLo
+ : NewAccumulator->getOperand(0).getReg();
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+ .add(Op1L)
+ .addReg(LowOpcode);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+ .add(Op1L)
+ .addReg(LowOpcode);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+ .add(Op1H)
+ .addReg(LowOpcode);
+
+ Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
+ .addReg(CarryReg)
+ .addReg(Op1H_Op0L_Reg)
+ .setOperandDead(3); // Dead scc
+
+ if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+ .addReg(HiVal)
+ .addReg(Op1L_Op0H_Reg)
+ .setOperandDead(3); // Dead scc
+ }
+ BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ break;
+ }
}
RetBB = &BB;
}
@@ -5374,6 +5447,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addReg(Accumulator->getOperand(0).getReg());
break;
}
+ case ::AMDGPU::S_ADD_U64_PSEUDO:
+ case ::AMDGPU::S_SUB_U64_PSEUDO: {
+ unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
+ : AMDGPU::S_SUB_U32;
+ unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
+ : AMDGPU::S_SUBB_U32;
+ Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
+ MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
+ &AMDGPU::SReg_32RegClass);
+ MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
+ MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
+ &AMDGPU::SReg_32RegClass);
+ BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
+ .add(Accumlo)
+ .addReg(LaneValueLo->getOperand(0).getReg());
+ BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
+ .add(Accumhi)
+ .addReg(LaneValueHi->getOperand(0).getReg());
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+ TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+ .addReg(DestLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestHi)
+ .addImm(AMDGPU::sub1);
+ break;
+ }
}
}
// Manipulate the iterator to get the next active lane
@@ -5429,8 +5530,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+ case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+ case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 25a491ab87a06..64697673fa1b1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -345,6 +345,8 @@ defvar Operations = [
WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>,
+ WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>,
];
foreach Op = Operations in {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
new file mode 100644
index 0000000000000..b6af8b4bb798d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
@@ -0,0 +1,2589 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_clause 0x1
+; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_clause 0x1
+; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s4, 0
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_add_i32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_add_i32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_add_i32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_add_i32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s6, s6, s2
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, 0
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s2
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s2
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2
+; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT: s_add_i32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT: s_add_i32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_i32 s6, s6, s2
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT: s_add_i32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2
+; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT: s_add_i32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT: s_add_i32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: s_mul_i32 s0, s2, s4
+; GFX8DAGISEL-NEXT: s_mul_hi_u32 s1, s2, s4
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s4
+; GFX8DAGISEL-NEXT: s_add_u32 s1, s1, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s5, s[4:5]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s4, s2, s5
+; GFX8GISEL-NEXT: s_mul_hi_u32 s2, s2, s5
+; GFX8GISEL-NEXT: s_mul_i32 s3, s3, s5
+; GFX8GISEL-NEXT: s_add_u32 s5, s2, s3
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s5, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s4, s2, s5
+; GFX9DAGISEL-NEXT: s_mul_hi_u32 s2, s2, s5
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, s3, s5
+; GFX9DAGISEL-NEXT: s_add_u32 s5, s2, s3
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s5, s[4:5]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s4, s2, s5
+; GFX9GISEL-NEXT: s_mul_hi_u32 s2, s2, s5
+; GFX9GISEL-NEXT: s_mul_i32 s3, s3, s5
+; GFX9GISEL-NEXT: s_add_u32 s5, s2, s3
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1064DAGISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value_i64:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX1064GISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1064GISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value_i64:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1032DAGISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value_i64:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX1032GISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1032GISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_i64:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1164DAGISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_i64:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX1164GISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1164GISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_i64:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1132DAGISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_i64:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX1132GISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1132GISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %in, i32 1)
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value_i64(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[2:3]
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, 0x7b
+; GFX8DAGISEL-NEXT: s_mul_hi_u32 s4, 0x7b, s3
+; GFX8DAGISEL-NEXT: s_mul_i32 s3, s3, 0
+; GFX8DAGISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s3, s[2:3]
+; GFX8GISEL-NEXT: s_mul_i32 s2, s3, 0x7b
+; GFX8GISEL-NEXT: s_mul_hi_u32 s4, 0x7b, s3
+; GFX8GISEL-NEXT: s_mul_i32 s3, s3, 0
+; GFX8GISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[2:3]
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, 0x7b
+; GFX9DAGISEL-NEXT: s_mul_hi_u32 s4, 0x7b, s3
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, s3, 0
+; GFX9DAGISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s3, s[2:3]
+; GFX9GISEL-NEXT: s_mul_i32 s2, s3, 0x7b
+; GFX9GISEL-NEXT: s_mul_hi_u32 s4, 0x7b, s3
+; GFX9GISEL-NEXT: s_mul_i32 s3, s3, 0
+; GFX9GISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_mul_hi_u32 s3, 0x7b, s2
+; GFX1064DAGISEL-NEXT: s_mul_i32 s4, s2, 0
+; GFX1064DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1064DAGISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value_i64:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_mul_hi_u32 s3, 0x7b, s2
+; GFX1064GISEL-NEXT: s_mul_i32 s4, s2, 0
+; GFX1064GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1064GISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value_i64:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_mul_hi_u32 s3, 0x7b, s2
+; GFX1032DAGISEL-NEXT: s_mul_i32 s4, s2, 0
+; GFX1032DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1032DAGISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value_i64:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_mul_hi_u32 s3, 0x7b, s2
+; GFX1032GISEL-NEXT: s_mul_i32 s4, s2, 0
+; GFX1032GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1032GISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value_i64:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mul_hi_u32 s3, 0x7b, s2
+; GFX1164DAGISEL-NEXT: s_mul_i32 s4, s2, 0
+; GFX1164DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1164DAGISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value_i64:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mul_hi_u32 s3, 0x7b, s2
+; GFX1164GISEL-NEXT: s_mul_i32 s4, s2, 0
+; GFX1164GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1164GISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value_i64:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_mul_hi_u32 s3, 0x7b, s2
+; GFX1132DAGISEL-NEXT: s_mul_i32 s4, s2, 0
+; GFX1132DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1132DAGISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value_i64:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_mul_hi_u32 s3, 0x7b, s2
+; GFX1132GISEL-NEXT: s_mul_i32 s4, s2, 0
+; GFX1132GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1132GISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 123, i32 1)
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: poison_value_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s0, s3
+; GFX8DAGISEL-NEXT: s_mul_hi_u32 s4, s0, s3
+; GFX8DAGISEL-NEXT: s_mul_i32 s3, s1, s3
+; GFX8DAGISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s3, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s2, s0, s3
+; GFX8GISEL-NEXT: s_mul_hi_u32 s4, s0, s3
+; GFX8GISEL-NEXT: s_mul_i32 s3, s1, s3
+; GFX8GISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s0, s3
+; GFX9DAGISEL-NEXT: s_mul_hi_u32 s4, s0, s3
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, s1, s3
+; GFX9DAGISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s3, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s2, s0, s3
+; GFX9GISEL-NEXT: s_mul_hi_u32 s4, s0, s3
+; GFX9GISEL-NEXT: s_mul_i32 s3, s1, s3
+; GFX9GISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_hi_u32 s3, s0, s2
+; GFX1064DAGISEL-NEXT: s_mul_i32 s4, s1, s2
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1064DAGISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value_i64:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_hi_u32 s3, s0, s2
+; GFX1064GISEL-NEXT: s_mul_i32 s4, s1, s2
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1064GISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value_i64:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_hi_u32 s3, s0, s2
+; GFX1032DAGISEL-NEXT: s_mul_i32 s4, s1, s2
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032DAGISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value_i64:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_hi_u32 s3, s0, s2
+; GFX1032GISEL-NEXT: s_mul_i32 s4, s1, s2
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032GISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value_i64:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_hi_u32 s3, s0, s2
+; GFX1164DAGISEL-NEXT: s_mul_i32 s4, s1, s2
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1164DAGISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value_i64:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_hi_u32 s3, s0, s2
+; GFX1164GISEL-NEXT: s_mul_i32 s4, s1, s2
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1164GISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value_i64:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_hi_u32 s3, s0, s2
+; GFX1132DAGISEL-NEXT: s_mul_i32 s4, s1, s2
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132DAGISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value_i64:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_hi_u32 s3, s0, s2
+; GFX1132GISEL-NEXT: s_mul_i32 s4, s1, s2
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132GISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 poison, i32 1)
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
+; GFX8DAGISEL-LABEL: divergent_value_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s5, s4
+; GFX8DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
+; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
+; GFX8DAGISEL-NEXT: s_add_u32 s4, s4, s9
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8
+; GFX8DAGISEL-NEXT: s_addc_u32 s5, s5, s10
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT: s_mov_b32 s5, s4
+; GFX8GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
+; GFX8GISEL-NEXT: v_readlane_b32 s9, v2, s8
+; GFX8GISEL-NEXT: v_readlane_b32 s10, v3, s8
+; GFX8GISEL-NEXT: s_add_u32 s4, s4, s9
+; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s8
+; GFX8GISEL-NEXT: s_addc_u32 s5, s5, s10
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s5, s4
+; GFX9DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
+; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
+; GFX9DAGISEL-NEXT: s_add_u32 s4, s4, s9
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8
+; GFX9DAGISEL-NEXT: s_addc_u32 s5, s5, s10
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mov_b32 s4, 0
+; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX9GISEL-NEXT: s_mov_b32 s5, s4
+; GFX9GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
+; GFX9GISEL-NEXT: v_readlane_b32 s9, v2, s8
+; GFX9GISEL-NEXT: v_readlane_b32 s10, v3, s8
+; GFX9GISEL-NEXT: s_add_u32 s4, s4, s9
+; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s8
+; GFX9GISEL-NEXT: s_addc_u32 s5, s5, s10
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX9GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s5, s4
+; GFX1064DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8
+; GFX1064DAGISEL-NEXT: s_add_u32 s4, s4, s9
+; GFX1064DAGISEL-NEXT: s_addc_u32 s5, s5, s10
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_i64:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s5, s4
+; GFX1064GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
+; GFX1064GISEL-NEXT: v_readlane_b32 s9, v2, s8
+; GFX1064GISEL-NEXT: v_readlane_b32 s10, v3, s8
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s8
+; GFX1064GISEL-NEXT: s_add_u32 s4, s4, s9
+; GFX1064GISEL-NEXT: s_addc_u32 s5, s5, s10
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1064GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_i64:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s5, s4
+; GFX1032DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s7
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7
+; GFX1032DAGISEL-NEXT: s_add_u32 s4, s4, s8
+; GFX1032DAGISEL-NEXT: s_addc_u32 s5, s5, s9
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_i64:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s5, s4
+; GFX1032GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6
+; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s7
+; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7
+; GFX1032GISEL-NEXT: s_add_u32 s4, s4, s8
+; GFX1032GISEL-NEXT: s_addc_u32 s5, s5, s9
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_i64:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mov_b32 s0, 0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s1, s0
+; GFX1164DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s4, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s5, v2, s4
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v3, s4
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s4
+; GFX1164DAGISEL-NEXT: s_add_u32 s0, s0, s5
+; GFX1164DAGISEL-NEXT: s_addc_u32 s1, s1, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_i64:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s1, s0
+; GFX1164GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s4, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s5, v2, s4
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v3, s4
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s4
+; GFX1164GISEL-NEXT: s_add_u32 s0, s0, s5
+; GFX1164GISEL-NEXT: s_addc_u32 s1, s1, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_i64:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, 0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, s0
+; GFX1132DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v3, s3
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT: s_add_u32 s0, s0, s4
+; GFX1132DAGISEL-NEXT: s_addc_u32 s1, s1, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_i64:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s1, s0
+; GFX1132GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v3, s3
+; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT: s_add_u32 s0, s0, s4
+; GFX1132GISEL-NEXT: s_addc_u32 s1, s1, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %id.x, i32 1)
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 %in2) {
+; GFX8DAGISEL-LABEL: divergent_cfg_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s7, s[6:7]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s6, s2, s7
+; GFX8DAGISEL-NEXT: s_mul_hi_u32 s2, s2, s7
+; GFX8DAGISEL-NEXT: s_mul_i32 s3, s3, s7
+; GFX8DAGISEL-NEXT: s_add_u32 s7, s2, s3
+; GFX8DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s7, s[6:7]
+; GFX8DAGISEL-NEXT: s_mul_i32 s6, s4, s7
+; GFX8DAGISEL-NEXT: s_mul_hi_u32 s4, s4, s7
+; GFX8DAGISEL-NEXT: s_mul_i32 s5, s5, s7
+; GFX8DAGISEL-NEXT: s_add_u32 s7, s4, s5
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX8DAGISEL-NEXT: ; %bb.4: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s7, s[6:7]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s6, s2, s7
+; GFX8GISEL-NEXT: s_mul_hi_u32 s2, s2, s7
+; GFX8GISEL-NEXT: s_mul_i32 s3, s3, s7
+; GFX8GISEL-NEXT: s_add_u32 s7, s2, s3
+; GFX8GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s7, s[6:7]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s6, s4, s7
+; GFX8GISEL-NEXT: s_mul_hi_u32 s4, s4, s7
+; GFX8GISEL-NEXT: s_mul_i32 s5, s5, s7
+; GFX8GISEL-NEXT: s_add_u32 s7, s4, s5
+; GFX8GISEL-NEXT: .LBB9_4: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s5, s[4:5]
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s4, s2, s5
+; GFX9DAGISEL-NEXT: s_mul_hi_u32 s2, s2, s5
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, s3, s5
+; GFX9DAGISEL-NEXT: s_add_u32 s5, s2, s3
+; GFX9DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s5, s[4:5]
+; GFX9DAGISEL-NEXT: s_mul_i32 s4, s6, s5
+; GFX9DAGISEL-NEXT: s_mul_hi_u32 s6, s6, s5
+; GFX9DAGISEL-NEXT: s_mul_i32 s5, s7, s5
+; GFX9DAGISEL-NEXT: s_add_u32 s5, s6, s5
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9DAGISEL-NEXT: ; %bb.4: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s7, s[6:7]
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s6, s2, s7
+; GFX9GISEL-NEXT: s_mul_hi_u32 s2, s2, s7
+; GFX9GISEL-NEXT: s_mul_i32 s3, s3, s7
+; GFX9GISEL-NEXT: s_add_u32 s7, s2, s3
+; GFX9GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s6, s8, s4
+; GFX9GISEL-NEXT: s_mul_hi_u32 s5, s8, s4
+; GFX9GISEL-NEXT: s_mul_i32 s4, s9, s4
+; GFX9GISEL-NEXT: s_add_u32 s7, s5, s4
+; GFX9GISEL-NEXT: .LBB9_4: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[8:9], exec
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_hi_u32 s9, s2, s8
+; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s3, s8
+; GFX1064DAGISEL-NEXT: s_mul_i32 s8, s2, s8
+; GFX1064DAGISEL-NEXT: s_add_u32 s9, s9, s3
+; GFX1064DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[4:5]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s9
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064DAGISEL-NEXT: s_mul_hi_u32 s5, s6, s4
+; GFX1064DAGISEL-NEXT: s_mul_i32 s7, s7, s4
+; GFX1064DAGISEL-NEXT: s_mul_i32 s4, s6, s4
+; GFX1064DAGISEL-NEXT: s_add_u32 s5, s5, s7
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1064DAGISEL-NEXT: ; %bb.4: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg_i64:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_hi_u32 s7, s2, s6
+; GFX1064GISEL-NEXT: s_mul_i32 s3, s3, s6
+; GFX1064GISEL-NEXT: s_mul_i32 s6, s2, s6
+; GFX1064GISEL-NEXT: s_add_u32 s7, s7, s3
+; GFX1064GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_hi_u32 s5, s6, s4
+; GFX1064GISEL-NEXT: s_mul_i32 s7, s7, s4
+; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s4
+; GFX1064GISEL-NEXT: s_add_u32 s7, s5, s7
+; GFX1064GISEL-NEXT: .LBB9_4: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1032DAGISEL-NEXT: s_mul_i32 s4, s2, s4
+; GFX1032DAGISEL-NEXT: s_add_u32 s5, s5, s3
+; GFX1032DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT: s_mul_hi_u32 s5, s6, s3
+; GFX1032DAGISEL-NEXT: s_mul_i32 s7, s7, s3
+; GFX1032DAGISEL-NEXT: s_mul_i32 s4, s6, s3
+; GFX1032DAGISEL-NEXT: s_add_u32 s5, s5, s7
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1032DAGISEL-NEXT: ; %bb.4: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg_i64:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s6, s6
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_hi_u32 s7, s2, s6
+; GFX1032GISEL-NEXT: s_mul_i32 s3, s3, s6
+; GFX1032GISEL-NEXT: s_mul_i32 s6, s2, s6
+; GFX1032GISEL-NEXT: s_add_u32 s7, s7, s3
+; GFX1032GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s2, s8
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_hi_u32 s4, s6, s3
+; GFX1032GISEL-NEXT: s_mul_i32 s5, s7, s3
+; GFX1032GISEL-NEXT: s_mul_i32 s6, s6, s3
+; GFX1032GISEL-NEXT: s_add_u32 s7, s4, s5
+; GFX1032GISEL-NEXT: .LBB9_4: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[8:9], exec
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_hi_u32 s9, s2, s8
+; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s3, s8
+; GFX1164DAGISEL-NEXT: s_mul_i32 s8, s2, s8
+; GFX1164DAGISEL-NEXT: s_add_u32 s9, s9, s3
+; GFX1164DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s9
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mul_hi_u32 s7, s4, s6
+; GFX1164DAGISEL-NEXT: s_mul_i32 s5, s5, s6
+; GFX1164DAGISEL-NEXT: s_mul_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_add_u32 s5, s7, s5
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1164DAGISEL-NEXT: ; %bb.4: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg_i64:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[8:9], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_hi_u32 s7, s2, s6
+; GFX1164GISEL-NEXT: s_mul_i32 s3, s3, s6
+; GFX1164GISEL-NEXT: s_mul_i32 s6, s2, s6
+; GFX1164GISEL-NEXT: s_add_u32 s7, s7, s3
+; GFX1164GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_hi_u32 s7, s4, s6
+; GFX1164GISEL-NEXT: s_mul_i32 s5, s5, s6
+; GFX1164GISEL-NEXT: s_mul_i32 s6, s4, s6
+; GFX1164GISEL-NEXT: s_add_u32 s7, s7, s5
+; GFX1164GISEL-NEXT: .LBB9_4: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_mov_b32 s6, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s6, s6
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_hi_u32 s7, s2, s6
+; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s3, s6
+; GFX1132DAGISEL-NEXT: s_mul_i32 s6, s2, s6
+; GFX1132DAGISEL-NEXT: s_add_u32 s7, s7, s3
+; GFX1132DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_mul_hi_u32 s6, s4, s3
+; GFX1132DAGISEL-NEXT: s_mul_i32 s5, s5, s3
+; GFX1132DAGISEL-NEXT: s_mul_i32 s4, s4, s3
+; GFX1132DAGISEL-NEXT: s_add_u32 s5, s6, s5
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1132DAGISEL-NEXT: ; %bb.4: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg_i64:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_mov_b32 s6, exec_lo
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s6, s6
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_hi_u32 s7, s2, s6
+; GFX1132GISEL-NEXT: s_mul_i32 s3, s3, s6
+; GFX1132GISEL-NEXT: s_mul_i32 s6, s2, s6
+; GFX1132GISEL-NEXT: s_add_u32 s7, s7, s3
+; GFX1132GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s2, s8
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_hi_u32 s7, s4, s3
+; GFX1132GISEL-NEXT: s_mul_i32 s5, s5, s3
+; GFX1132GISEL-NEXT: s_mul_i32 s6, s4, s3
+; GFX1132GISEL-NEXT: s_add_u32 s7, s7, s5
+; GFX1132GISEL-NEXT: .LBB9_4: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %in2, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i64 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i64 %combine, ptr addrspace(1) %out
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10DAGISEL: {{.*}}
+; GFX10GISEL: {{.*}}
+; GFX11DAGISEL: {{.*}}
+; GFX11GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
new file mode 100644
index 0000000000000..9a0917133fc59
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll
@@ -0,0 +1,2897 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_clause 0x1
+; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_clause 0x1
+; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_clause 0x1
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_clause 0x1
+; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %in, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_sub_i32 s4, 0, 0x7b
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_sub_i32 s3, 0, 0x7b
+; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: s_sub_i32 s3, 0, 0x7b
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: s_sub_i32 s3, 0, 0x7b
+; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_sub_i32 s3, 0, 0x7b
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_sub_i32 s3, 0, 0x7b
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: s_sub_i32 s3, 0, 0x7b
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: s_sub_i32 s3, 0, 0x7b
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_sub_i32 s3, 0, 0x7b
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_sub_i32 s3, 0, 0x7b
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_sub_i32 s3, 0, 0x7b
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: s_sub_i32 s3, 0, 0x7b
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 123, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: poison_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_sub_i32 s4, 0, s0
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s4, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_sub_i32 s3, 0, s0
+; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_sub_i32 s3, 0, s0
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_sub_i32 s3, 0, s0
+; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_sub_i32 s3, 0, s0
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_sub_i32 s3, 0, s0
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_sub_i32 s3, 0, s0
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_sub_i32 s3, 0, s0
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_sub_i32 s3, 0, s0
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_sub_i32 s3, 0, s0
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_sub_i32 s3, 0, s0
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_sub_i32 s3, 0, s0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 poison, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: divergent_value:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_value:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_value:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_value:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s4, 0
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_value:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_value:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_value:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032DAGISEL-NEXT: s_sub_i32 s2, s2, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_value:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1032GISEL-NEXT: s_sub_i32 s2, s2, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_value:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_value:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_sub_i32 s4, s4, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_value:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132DAGISEL-NEXT: s_sub_i32 s2, s2, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_value:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX1132GISEL-NEXT: s_sub_i32 s2, s2, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %id.x, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: divergent_cfg:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8DAGISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8DAGISEL-NEXT: ; %bb.5:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX8GISEL-NEXT: s_mul_i32 s6, s3, s2
+; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, 0
+; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX8GISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX8GISEL-NEXT: .LBB4_5: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9DAGISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9DAGISEL-NEXT: ; %bb.5:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX9GISEL-NEXT: s_mul_i32 s6, s3, s2
+; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX9GISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX9GISEL-NEXT: .LBB4_5: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064DAGISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064DAGISEL-NEXT: ; %bb.5:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX1064GISEL-NEXT: s_mul_i32 s6, s3, s2
+; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1064GISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_sub_i32 s1, 0, s1
+; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2
+; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032DAGISEL-NEXT: s_sub_i32 s1, s1, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032DAGISEL-NEXT: ; %bb.5:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_sub_i32 s0, 0, s0
+; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2
+; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1032GISEL-NEXT: s_sub_i32 s0, s0, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164DAGISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164DAGISEL-NEXT: ; %bb.5:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_sub_i32 s3, 0, s6
+; GFX1164GISEL-NEXT: s_mul_i32 s6, s3, s2
+; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
+; GFX1164GISEL-NEXT: s_sub_i32 s6, s6, s8
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_sub_i32 s1, 0, s1
+; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2
+; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT: s_sub_i32 s1, s1, s6
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132DAGISEL-NEXT: ; %bb.5:
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_sub_i32 s0, 0, s0
+; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2
+; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
+; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT: s_sub_i32 s0, s0, s6
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
+; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %tid, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i32 %combine, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8DAGISEL-NEXT: s_sub_i32 s4, 0, s4
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: s_ashr_i32 s0, s4, 31
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: s_mul_i32 s1, s2, s0
+; GFX8DAGISEL-NEXT: s_mul_i32 s0, s2, s4
+; GFX8DAGISEL-NEXT: s_mul_hi_u32 s2, s2, s4
+; GFX8DAGISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX8DAGISEL-NEXT: s_add_u32 s2, s2, s3
+; GFX8DAGISEL-NEXT: s_add_u32 s1, s2, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8GISEL-NEXT: s_sub_i32 s5, 0, s4
+; GFX8GISEL-NEXT: s_ashr_i32 s4, s5, 31
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s6, s2, s4
+; GFX8GISEL-NEXT: s_mul_i32 s4, s2, s5
+; GFX8GISEL-NEXT: s_mul_hi_u32 s2, s2, s5
+; GFX8GISEL-NEXT: s_mul_i32 s3, s3, s5
+; GFX8GISEL-NEXT: s_add_u32 s2, s2, s3
+; GFX8GISEL-NEXT: s_add_u32 s5, s2, s6
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9DAGISEL-NEXT: s_sub_i32 s5, 0, s4
+; GFX9DAGISEL-NEXT: s_ashr_i32 s4, s5, 31
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s6, s2, s4
+; GFX9DAGISEL-NEXT: s_mul_i32 s4, s2, s5
+; GFX9DAGISEL-NEXT: s_mul_hi_u32 s2, s2, s5
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, s3, s5
+; GFX9DAGISEL-NEXT: s_add_u32 s2, s2, s3
+; GFX9DAGISEL-NEXT: s_add_u32 s5, s2, s6
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9GISEL-NEXT: s_sub_i32 s5, 0, s4
+; GFX9GISEL-NEXT: s_ashr_i32 s4, s5, 31
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s6, s2, s4
+; GFX9GISEL-NEXT: s_mul_i32 s4, s2, s5
+; GFX9GISEL-NEXT: s_mul_hi_u32 s2, s2, s5
+; GFX9GISEL-NEXT: s_mul_i32 s3, s3, s5
+; GFX9GISEL-NEXT: s_add_u32 s2, s2, s3
+; GFX9GISEL-NEXT: s_add_u32 s5, s2, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064DAGISEL-NEXT: s_sub_i32 s4, 0, s4
+; GFX1064DAGISEL-NEXT: s_ashr_i32 s5, s4, 31
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_hi_u32 s6, s2, s4
+; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1064DAGISEL-NEXT: s_mul_i32 s5, s2, s5
+; GFX1064DAGISEL-NEXT: s_add_u32 s3, s6, s3
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1064DAGISEL-NEXT: s_add_u32 s3, s3, s5
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value_i64:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064GISEL-NEXT: s_sub_i32 s4, 0, s4
+; GFX1064GISEL-NEXT: s_ashr_i32 s5, s4, 31
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_hi_u32 s6, s2, s4
+; GFX1064GISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1064GISEL-NEXT: s_mul_i32 s5, s2, s5
+; GFX1064GISEL-NEXT: s_add_u32 s3, s6, s3
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1064GISEL-NEXT: s_add_u32 s3, s3, s5
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value_i64:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1032DAGISEL-NEXT: s_sub_i32 s4, 0, s4
+; GFX1032DAGISEL-NEXT: s_ashr_i32 s5, s4, 31
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_hi_u32 s6, s2, s4
+; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1032DAGISEL-NEXT: s_mul_i32 s5, s2, s5
+; GFX1032DAGISEL-NEXT: s_add_u32 s3, s6, s3
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1032DAGISEL-NEXT: s_add_u32 s3, s3, s5
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value_i64:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1032GISEL-NEXT: s_sub_i32 s4, 0, s4
+; GFX1032GISEL-NEXT: s_ashr_i32 s5, s4, 31
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_hi_u32 s6, s2, s4
+; GFX1032GISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1032GISEL-NEXT: s_mul_i32 s5, s2, s5
+; GFX1032GISEL-NEXT: s_add_u32 s3, s6, s3
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1032GISEL-NEXT: s_add_u32 s3, s3, s5
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_i64:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_sub_i32 s4, 0, s4
+; GFX1164DAGISEL-NEXT: s_ashr_i32 s5, s4, 31
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_hi_u32 s6, s2, s4
+; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1164DAGISEL-NEXT: s_mul_i32 s5, s2, s5
+; GFX1164DAGISEL-NEXT: s_add_u32 s3, s6, s3
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1164DAGISEL-NEXT: s_add_u32 s3, s3, s5
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_i64:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_sub_i32 s4, 0, s4
+; GFX1164GISEL-NEXT: s_ashr_i32 s5, s4, 31
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_hi_u32 s6, s2, s4
+; GFX1164GISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1164GISEL-NEXT: s_mul_i32 s5, s2, s5
+; GFX1164GISEL-NEXT: s_add_u32 s3, s6, s3
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1164GISEL-NEXT: s_add_u32 s3, s3, s5
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_i64:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1132DAGISEL-NEXT: s_sub_i32 s4, 0, s4
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_ashr_i32 s5, s4, 31
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_hi_u32 s6, s2, s4
+; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1132DAGISEL-NEXT: s_mul_i32 s5, s2, s5
+; GFX1132DAGISEL-NEXT: s_add_u32 s3, s6, s3
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1132DAGISEL-NEXT: s_add_u32 s3, s3, s5
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_i64:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_sub_i32 s4, 0, s4
+; GFX1132GISEL-NEXT: s_ashr_i32 s5, s4, 31
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_hi_u32 s6, s2, s4
+; GFX1132GISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1132GISEL-NEXT: s_mul_i32 s5, s2, s5
+; GFX1132GISEL-NEXT: s_add_u32 s3, s6, s3
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s4
+; GFX1132GISEL-NEXT: s_add_u32 s3, s3, s5
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 %in, i32 1)
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @const_value_i64(ptr addrspace(1) %out) {
+; GFX8DAGISEL-LABEL: const_value_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_sub_i32 s3, 0, s2
+; GFX8DAGISEL-NEXT: s_ashr_i32 s2, s3, 31
+; GFX8DAGISEL-NEXT: s_mul_i32 s4, s2, 0x7b
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, 0x7b
+; GFX8DAGISEL-NEXT: s_mul_hi_u32 s5, 0x7b, s3
+; GFX8DAGISEL-NEXT: s_mul_i32 s3, s3, 0
+; GFX8DAGISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX8DAGISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: const_value_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_sub_i32 s3, 0, s2
+; GFX8GISEL-NEXT: s_ashr_i32 s2, s3, 31
+; GFX8GISEL-NEXT: s_mul_i32 s4, s2, 0x7b
+; GFX8GISEL-NEXT: s_mul_i32 s2, s3, 0x7b
+; GFX8GISEL-NEXT: s_mul_hi_u32 s5, 0x7b, s3
+; GFX8GISEL-NEXT: s_mul_i32 s3, s3, 0
+; GFX8GISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX8GISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: const_value_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: s_sub_i32 s3, 0, s2
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_ashr_i32 s2, s3, 31
+; GFX9DAGISEL-NEXT: s_mul_i32 s4, s2, 0x7b
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, 0x7b
+; GFX9DAGISEL-NEXT: s_mul_hi_u32 s5, 0x7b, s3
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, s3, 0
+; GFX9DAGISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX9DAGISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: const_value_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: s_sub_i32 s3, 0, s2
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_ashr_i32 s2, s3, 31
+; GFX9GISEL-NEXT: s_mul_i32 s4, s2, 0x7b
+; GFX9GISEL-NEXT: s_mul_i32 s2, s3, 0x7b
+; GFX9GISEL-NEXT: s_mul_hi_u32 s5, 0x7b, s3
+; GFX9GISEL-NEXT: s_mul_i32 s3, s3, 0
+; GFX9GISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX9GISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: const_value_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1064DAGISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1064DAGISEL-NEXT: s_mul_hi_u32 s4, 0x7b, s2
+; GFX1064DAGISEL-NEXT: s_mul_i32 s5, s2, 0
+; GFX1064DAGISEL-NEXT: s_mulk_i32 s3, 0x7b
+; GFX1064DAGISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX1064DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1064DAGISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: const_value_i64:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1064GISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1064GISEL-NEXT: s_mul_hi_u32 s4, 0x7b, s2
+; GFX1064GISEL-NEXT: s_mul_i32 s5, s2, 0
+; GFX1064GISEL-NEXT: s_mulk_i32 s3, 0x7b
+; GFX1064GISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX1064GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1064GISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: const_value_i64:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1032DAGISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1032DAGISEL-NEXT: s_mul_hi_u32 s4, 0x7b, s2
+; GFX1032DAGISEL-NEXT: s_mul_i32 s5, s2, 0
+; GFX1032DAGISEL-NEXT: s_mulk_i32 s3, 0x7b
+; GFX1032DAGISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX1032DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1032DAGISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: const_value_i64:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1032GISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1032GISEL-NEXT: s_mul_hi_u32 s4, 0x7b, s2
+; GFX1032GISEL-NEXT: s_mul_i32 s5, s2, 0
+; GFX1032GISEL-NEXT: s_mulk_i32 s3, 0x7b
+; GFX1032GISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX1032GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1032GISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: const_value_i64:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1164DAGISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1164DAGISEL-NEXT: s_mul_hi_u32 s4, 0x7b, s2
+; GFX1164DAGISEL-NEXT: s_mul_i32 s5, s2, 0
+; GFX1164DAGISEL-NEXT: s_mulk_i32 s3, 0x7b
+; GFX1164DAGISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX1164DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1164DAGISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: const_value_i64:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1164GISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1164GISEL-NEXT: s_mul_hi_u32 s4, 0x7b, s2
+; GFX1164GISEL-NEXT: s_mul_i32 s5, s2, 0
+; GFX1164GISEL-NEXT: s_mulk_i32 s3, 0x7b
+; GFX1164GISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX1164GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1164GISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: const_value_i64:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1132DAGISEL-NEXT: s_mul_hi_u32 s4, 0x7b, s2
+; GFX1132DAGISEL-NEXT: s_mul_i32 s5, s2, 0
+; GFX1132DAGISEL-NEXT: s_mulk_i32 s3, 0x7b
+; GFX1132DAGISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX1132DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1132DAGISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: const_value_i64:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1132GISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1132GISEL-NEXT: s_mul_hi_u32 s4, 0x7b, s2
+; GFX1132GISEL-NEXT: s_mul_i32 s5, s2, 0
+; GFX1132GISEL-NEXT: s_mulk_i32 s3, 0x7b
+; GFX1132GISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX1132GISEL-NEXT: s_mulk_i32 s2, 0x7b
+; GFX1132GISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 123, i32 1)
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @poison_value_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: poison_value_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT: s_sub_i32 s3, 0, s2
+; GFX8DAGISEL-NEXT: s_ashr_i32 s2, s3, 31
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s4, s0, s2
+; GFX8DAGISEL-NEXT: s_mul_i32 s2, s0, s3
+; GFX8DAGISEL-NEXT: s_mul_hi_u32 s5, s0, s3
+; GFX8DAGISEL-NEXT: s_mul_i32 s3, s1, s3
+; GFX8DAGISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX8DAGISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: poison_value_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT: s_sub_i32 s3, 0, s2
+; GFX8GISEL-NEXT: s_ashr_i32 s2, s3, 31
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s4, s0, s2
+; GFX8GISEL-NEXT: s_mul_i32 s2, s0, s3
+; GFX8GISEL-NEXT: s_mul_hi_u32 s5, s0, s3
+; GFX8GISEL-NEXT: s_mul_i32 s3, s1, s3
+; GFX8GISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX8GISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: poison_value_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT: s_sub_i32 s3, 0, s2
+; GFX9DAGISEL-NEXT: s_ashr_i32 s2, s3, 31
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s4, s0, s2
+; GFX9DAGISEL-NEXT: s_mul_i32 s2, s0, s3
+; GFX9DAGISEL-NEXT: s_mul_hi_u32 s5, s0, s3
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, s1, s3
+; GFX9DAGISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX9DAGISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: poison_value_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT: s_sub_i32 s3, 0, s2
+; GFX9GISEL-NEXT: s_ashr_i32 s2, s3, 31
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s4, s0, s2
+; GFX9GISEL-NEXT: s_mul_i32 s2, s0, s3
+; GFX9GISEL-NEXT: s_mul_hi_u32 s5, s0, s3
+; GFX9GISEL-NEXT: s_mul_i32 s3, s1, s3
+; GFX9GISEL-NEXT: s_add_u32 s3, s5, s3
+; GFX9GISEL-NEXT: s_add_u32 s3, s3, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: poison_value_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1064DAGISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_hi_u32 s4, s0, s2
+; GFX1064DAGISEL-NEXT: s_mul_i32 s5, s1, s2
+; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s0, s3
+; GFX1064DAGISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1064DAGISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: poison_value_i64:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1064GISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_hi_u32 s4, s0, s2
+; GFX1064GISEL-NEXT: s_mul_i32 s5, s1, s2
+; GFX1064GISEL-NEXT: s_mul_i32 s3, s0, s3
+; GFX1064GISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX1064GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1064GISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: poison_value_i64:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032DAGISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1032DAGISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_hi_u32 s4, s0, s2
+; GFX1032DAGISEL-NEXT: s_mul_i32 s5, s1, s2
+; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s0, s3
+; GFX1032DAGISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032DAGISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: poison_value_i64:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032GISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1032GISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_hi_u32 s4, s0, s2
+; GFX1032GISEL-NEXT: s_mul_i32 s5, s1, s2
+; GFX1032GISEL-NEXT: s_mul_i32 s3, s0, s3
+; GFX1032GISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX1032GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1032GISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: poison_value_i64:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1164DAGISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_hi_u32 s4, s0, s2
+; GFX1164DAGISEL-NEXT: s_mul_i32 s5, s1, s2
+; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s0, s3
+; GFX1164DAGISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1164DAGISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: poison_value_i64:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1164GISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_hi_u32 s4, s0, s2
+; GFX1164GISEL-NEXT: s_mul_i32 s5, s1, s2
+; GFX1164GISEL-NEXT: s_mul_i32 s3, s0, s3
+; GFX1164GISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX1164GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1164GISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: poison_value_i64:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132DAGISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_hi_u32 s4, s0, s2
+; GFX1132DAGISEL-NEXT: s_mul_i32 s5, s1, s2
+; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s0, s3
+; GFX1132DAGISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132DAGISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: poison_value_i64:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_sub_i32 s2, 0, s2
+; GFX1132GISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_hi_u32 s4, s0, s2
+; GFX1132GISEL-NEXT: s_mul_i32 s5, s1, s2
+; GFX1132GISEL-NEXT: s_mul_i32 s3, s0, s3
+; GFX1132GISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX1132GISEL-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132GISEL-NEXT: s_add_u32 s3, s4, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %result = call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 poison, i32 1)
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
+; GFX8DAGISEL-LABEL: divergent_value_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s5, s4
+; GFX8DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
+; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
+; GFX8DAGISEL-NEXT: s_sub_u32 s4, s4, s9
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8
+; GFX8DAGISEL-NEXT: s_subb_u32 s5, s5, s10
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT: s_mov_b32 s5, s4
+; GFX8GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
+; GFX8GISEL-NEXT: v_readlane_b32 s9, v2, s8
+; GFX8GISEL-NEXT: v_readlane_b32 s10, v3, s8
+; GFX8GISEL-NEXT: s_sub_u32 s4, s4, s9
+; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s8
+; GFX8GISEL-NEXT: s_subb_u32 s5, s5, s10
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s5, s4
+; GFX9DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
+; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
+; GFX9DAGISEL-NEXT: s_sub_u32 s4, s4, s9
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8
+; GFX9DAGISEL-NEXT: s_subb_u32 s5, s5, s10
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mov_b32 s4, 0
+; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX9GISEL-NEXT: s_mov_b32 s5, s4
+; GFX9GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
+; GFX9GISEL-NEXT: v_readlane_b32 s9, v2, s8
+; GFX9GISEL-NEXT: v_readlane_b32 s10, v3, s8
+; GFX9GISEL-NEXT: s_sub_u32 s4, s4, s9
+; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s8
+; GFX9GISEL-NEXT: s_subb_u32 s5, s5, s10
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX9GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s5, s4
+; GFX1064DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s9, v2, s8
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v3, s8
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8
+; GFX1064DAGISEL-NEXT: s_sub_u32 s4, s4, s9
+; GFX1064DAGISEL-NEXT: s_subb_u32 s5, s5, s10
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_i64:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s5, s4
+; GFX1064GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7]
+; GFX1064GISEL-NEXT: v_readlane_b32 s9, v2, s8
+; GFX1064GISEL-NEXT: v_readlane_b32 s10, v3, s8
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s8
+; GFX1064GISEL-NEXT: s_sub_u32 s4, s4, s9
+; GFX1064GISEL-NEXT: s_subb_u32 s5, s5, s10
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1064GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_i64:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s5, s4
+; GFX1032DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s7
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7
+; GFX1032DAGISEL-NEXT: s_sub_u32 s4, s4, s8
+; GFX1032DAGISEL-NEXT: s_subb_u32 s5, s5, s9
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_i64:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s5, s4
+; GFX1032GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6
+; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s7
+; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7
+; GFX1032GISEL-NEXT: s_sub_u32 s4, s4, s8
+; GFX1032GISEL-NEXT: s_subb_u32 s5, s5, s9
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_i64:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mov_b32 s0, 0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b32 s1, s0
+; GFX1164DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s4, s[2:3]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s5, v2, s4
+; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v3, s4
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s4
+; GFX1164DAGISEL-NEXT: s_sub_u32 s0, s0, s5
+; GFX1164DAGISEL-NEXT: s_subb_u32 s1, s1, s6
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1164DAGISEL-NEXT: ; %bb.2:
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_i64:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_mov_b32 s1, s0
+; GFX1164GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s4, s[2:3]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: v_readlane_b32 s5, v2, s4
+; GFX1164GISEL-NEXT: v_readlane_b32 s6, v3, s4
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s4
+; GFX1164GISEL-NEXT: s_sub_u32 s0, s0, s5
+; GFX1164GISEL-NEXT: s_subb_u32 s1, s1, s6
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1164GISEL-NEXT: ; %bb.2:
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_i64:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, 0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, s0
+; GFX1132DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v3, s3
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132DAGISEL-NEXT: s_sub_u32 s0, s0, s4
+; GFX1132DAGISEL-NEXT: s_subb_u32 s1, s1, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1132DAGISEL-NEXT: ; %bb.2:
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_i64:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s1, s0
+; GFX1132GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1132GISEL-NEXT: v_readlane_b32 s5, v3, s3
+; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX1132GISEL-NEXT: s_sub_u32 s0, s0, s4
+; GFX1132GISEL-NEXT: s_subb_u32 s1, s1, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX1132GISEL-NEXT: ; %bb.2:
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %result = call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 %id.x, i32 1)
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 %in2) {
+; GFX8DAGISEL-LABEL: divergent_cfg_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX8DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8DAGISEL-NEXT: s_sub_i32 s7, 0, s6
+; GFX8DAGISEL-NEXT: s_ashr_i32 s6, s7, 31
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_mul_i32 s10, s2, s6
+; GFX8DAGISEL-NEXT: s_mul_i32 s6, s2, s7
+; GFX8DAGISEL-NEXT: s_mul_hi_u32 s2, s2, s7
+; GFX8DAGISEL-NEXT: s_mul_i32 s3, s3, s7
+; GFX8DAGISEL-NEXT: s_add_u32 s2, s2, s3
+; GFX8DAGISEL-NEXT: s_add_u32 s7, s2, s10
+; GFX8DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8DAGISEL-NEXT: s_sub_i32 s7, 0, s6
+; GFX8DAGISEL-NEXT: s_ashr_i32 s6, s7, 31
+; GFX8DAGISEL-NEXT: s_mul_i32 s8, s4, s6
+; GFX8DAGISEL-NEXT: s_mul_i32 s6, s4, s7
+; GFX8DAGISEL-NEXT: s_mul_hi_u32 s4, s4, s7
+; GFX8DAGISEL-NEXT: s_mul_i32 s5, s5, s7
+; GFX8DAGISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX8DAGISEL-NEXT: s_add_u32 s7, s4, s8
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX8DAGISEL-NEXT: .LBB9_4: ; %endif
+; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: divergent_cfg_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX8GISEL-NEXT: ; %bb.1: ; %else
+; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8GISEL-NEXT: s_sub_i32 s7, 0, s6
+; GFX8GISEL-NEXT: s_ashr_i32 s6, s7, 31
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s10, s2, s6
+; GFX8GISEL-NEXT: s_mul_i32 s6, s2, s7
+; GFX8GISEL-NEXT: s_mul_hi_u32 s2, s2, s7
+; GFX8GISEL-NEXT: s_mul_i32 s3, s3, s7
+; GFX8GISEL-NEXT: s_add_u32 s2, s2, s3
+; GFX8GISEL-NEXT: s_add_u32 s7, s2, s10
+; GFX8GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX8GISEL-NEXT: ; %bb.3: ; %if
+; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX8GISEL-NEXT: s_sub_i32 s7, 0, s6
+; GFX8GISEL-NEXT: s_ashr_i32 s6, s7, 31
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: s_mul_i32 s8, s4, s6
+; GFX8GISEL-NEXT: s_mul_i32 s6, s4, s7
+; GFX8GISEL-NEXT: s_mul_hi_u32 s4, s4, s7
+; GFX8GISEL-NEXT: s_mul_i32 s5, s5, s7
+; GFX8GISEL-NEXT: s_add_u32 s4, s4, s5
+; GFX8GISEL-NEXT: s_add_u32 s7, s4, s8
+; GFX8GISEL-NEXT: .LBB9_4: ; %endif
+; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: divergent_cfg_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX9DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9DAGISEL-NEXT: s_sub_i32 s5, 0, s4
+; GFX9DAGISEL-NEXT: s_ashr_i32 s4, s5, 31
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_mul_i32 s10, s2, s4
+; GFX9DAGISEL-NEXT: s_mul_i32 s4, s2, s5
+; GFX9DAGISEL-NEXT: s_mul_hi_u32 s2, s2, s5
+; GFX9DAGISEL-NEXT: s_mul_i32 s3, s3, s5
+; GFX9DAGISEL-NEXT: s_add_u32 s2, s2, s3
+; GFX9DAGISEL-NEXT: s_add_u32 s5, s2, s10
+; GFX9DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9DAGISEL-NEXT: s_sub_i32 s5, 0, s4
+; GFX9DAGISEL-NEXT: s_ashr_i32 s4, s5, 31
+; GFX9DAGISEL-NEXT: s_mul_i32 s8, s6, s4
+; GFX9DAGISEL-NEXT: s_mul_i32 s4, s6, s5
+; GFX9DAGISEL-NEXT: s_mul_hi_u32 s6, s6, s5
+; GFX9DAGISEL-NEXT: s_mul_i32 s5, s7, s5
+; GFX9DAGISEL-NEXT: s_add_u32 s5, s6, s5
+; GFX9DAGISEL-NEXT: s_add_u32 s5, s5, s8
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9DAGISEL-NEXT: .LBB9_4: ; %endif
+; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: divergent_cfg_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX9GISEL-NEXT: ; %bb.1: ; %else
+; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX9GISEL-NEXT: s_sub_i32 s7, 0, s6
+; GFX9GISEL-NEXT: s_ashr_i32 s6, s7, 31
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s10, s2, s6
+; GFX9GISEL-NEXT: s_mul_i32 s6, s2, s7
+; GFX9GISEL-NEXT: s_mul_hi_u32 s2, s2, s7
+; GFX9GISEL-NEXT: s_mul_i32 s3, s3, s7
+; GFX9GISEL-NEXT: s_add_u32 s2, s2, s3
+; GFX9GISEL-NEXT: s_add_u32 s7, s2, s10
+; GFX9GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX9GISEL-NEXT: ; %bb.3: ; %if
+; GFX9GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9GISEL-NEXT: s_sub_i32 s4, 0, s4
+; GFX9GISEL-NEXT: s_ashr_i32 s5, s4, 31
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: s_mul_i32 s6, s8, s4
+; GFX9GISEL-NEXT: s_mul_hi_u32 s7, s8, s4
+; GFX9GISEL-NEXT: s_mul_i32 s4, s9, s4
+; GFX9GISEL-NEXT: s_mul_i32 s5, s8, s5
+; GFX9GISEL-NEXT: s_add_u32 s4, s7, s4
+; GFX9GISEL-NEXT: s_add_u32 s7, s4, s5
+; GFX9GISEL-NEXT: .LBB9_4: ; %endif
+; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
+; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[8:9], exec
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
+; GFX1064DAGISEL-NEXT: s_sub_i32 s8, 0, s8
+; GFX1064DAGISEL-NEXT: s_ashr_i32 s9, s8, 31
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_mul_hi_u32 s10, s2, s8
+; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s3, s8
+; GFX1064DAGISEL-NEXT: s_mul_i32 s9, s2, s9
+; GFX1064DAGISEL-NEXT: s_add_u32 s3, s10, s3
+; GFX1064DAGISEL-NEXT: s_mul_i32 s8, s2, s8
+; GFX1064DAGISEL-NEXT: s_add_u32 s9, s3, s9
+; GFX1064DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[4:5]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s9
+; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
+; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064DAGISEL-NEXT: s_sub_i32 s4, 0, s4
+; GFX1064DAGISEL-NEXT: s_ashr_i32 s5, s4, 31
+; GFX1064DAGISEL-NEXT: s_mul_hi_u32 s8, s6, s4
+; GFX1064DAGISEL-NEXT: s_mul_i32 s7, s7, s4
+; GFX1064DAGISEL-NEXT: s_mul_i32 s5, s6, s5
+; GFX1064DAGISEL-NEXT: s_add_u32 s7, s8, s7
+; GFX1064DAGISEL-NEXT: s_mul_i32 s4, s6, s4
+; GFX1064DAGISEL-NEXT: s_add_u32 s5, s7, s5
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1064DAGISEL-NEXT: ; %bb.4: ; %endif
+; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: divergent_cfg_i64:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0
+; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1064GISEL-NEXT: ; %bb.1: ; %else
+; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064GISEL-NEXT: s_sub_i32 s6, 0, s6
+; GFX1064GISEL-NEXT: s_ashr_i32 s7, s6, 31
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_hi_u32 s10, s2, s6
+; GFX1064GISEL-NEXT: s_mul_i32 s3, s3, s6
+; GFX1064GISEL-NEXT: s_mul_i32 s7, s2, s7
+; GFX1064GISEL-NEXT: s_add_u32 s3, s10, s3
+; GFX1064GISEL-NEXT: s_mul_i32 s6, s2, s6
+; GFX1064GISEL-NEXT: s_add_u32 s7, s3, s7
+; GFX1064GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX1064GISEL-NEXT: ; %bb.3: ; %if
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064GISEL-NEXT: s_sub_i32 s4, 0, s4
+; GFX1064GISEL-NEXT: s_ashr_i32 s5, s4, 31
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: s_mul_hi_u32 s8, s6, s4
+; GFX1064GISEL-NEXT: s_mul_i32 s7, s7, s4
+; GFX1064GISEL-NEXT: s_mul_i32 s5, s6, s5
+; GFX1064GISEL-NEXT: s_add_u32 s7, s8, s7
+; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s4
+; GFX1064GISEL-NEXT: s_add_u32 s7, s7, s5
+; GFX1064GISEL-NEXT: .LBB9_4: ; %endif
+; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0
+; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; GFX1032DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1032DAGISEL-NEXT: s_sub_i32 s4, 0, s4
+; GFX1032DAGISEL-NEXT: s_ashr_i32 s5, s4, 31
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_mul_hi_u32 s9, s2, s4
+; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s3, s4
+; GFX1032DAGISEL-NEXT: s_mul_i32 s5, s2, s5
+; GFX1032DAGISEL-NEXT: s_add_u32 s3, s9, s3
+; GFX1032DAGISEL-NEXT: s_mul_i32 s4, s2, s4
+; GFX1032DAGISEL-NEXT: s_add_u32 s5, s3, s5
+; GFX1032DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
+; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT: s_sub_i32 s3, 0, s3
+; GFX1032DAGISEL-NEXT: s_ashr_i32 s4, s3, 31
+; GFX1032DAGISEL-NEXT: s_mul_hi_u32 s5, s6, s3
+; GFX1032DAGISEL-NEXT: s_mul_i32 s7, s7, s3
+; GFX1032DAGISEL-NEXT: s_mul_i32 s8, s6, s4
+; GFX1032DAGISEL-NEXT: s_add_u32 s5, s5, s7
+; GFX1032DAGISEL-NEXT: s_mul_i32 s4, s6, s3
+; GFX1032DAGISEL-NEXT: s_add_u32 s5, s5, s8
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1032DAGISEL-NEXT: ; %bb.4: ; %endif
+; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: divergent_cfg_i64:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0
+; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1032GISEL-NEXT: ; %bb.1: ; %else
+; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s6, s6
+; GFX1032GISEL-NEXT: s_sub_i32 s6, 0, s6
+; GFX1032GISEL-NEXT: s_ashr_i32 s7, s6, 31
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_hi_u32 s9, s2, s6
+; GFX1032GISEL-NEXT: s_mul_i32 s3, s3, s6
+; GFX1032GISEL-NEXT: s_mul_i32 s7, s2, s7
+; GFX1032GISEL-NEXT: s_add_u32 s3, s9, s3
+; GFX1032GISEL-NEXT: s_mul_i32 s6, s2, s6
+; GFX1032GISEL-NEXT: s_add_u32 s7, s3, s7
+; GFX1032GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s2, s8
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX1032GISEL-NEXT: ; %bb.3: ; %if
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT: s_sub_i32 s3, 0, s3
+; GFX1032GISEL-NEXT: s_ashr_i32 s4, s3, 31
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: s_mul_hi_u32 s5, s6, s3
+; GFX1032GISEL-NEXT: s_mul_i32 s7, s7, s3
+; GFX1032GISEL-NEXT: s_mul_i32 s4, s6, s4
+; GFX1032GISEL-NEXT: s_add_u32 s5, s5, s7
+; GFX1032GISEL-NEXT: s_mul_i32 s6, s6, s3
+; GFX1032GISEL-NEXT: s_add_u32 s7, s5, s4
+; GFX1032GISEL-NEXT: .LBB9_4: ; %endif
+; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_clause 0x1
+; GFX1164DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[8:9], exec
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
+; GFX1164DAGISEL-NEXT: s_sub_i32 s8, 0, s8
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_ashr_i32 s9, s8, 31
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_mul_hi_u32 s10, s2, s8
+; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s3, s8
+; GFX1164DAGISEL-NEXT: s_mul_i32 s9, s2, s9
+; GFX1164DAGISEL-NEXT: s_add_u32 s3, s10, s3
+; GFX1164DAGISEL-NEXT: s_mul_i32 s8, s2, s8
+; GFX1164DAGISEL-NEXT: s_add_u32 s9, s3, s9
+; GFX1164DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s8
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s9
+; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT: s_sub_i32 s6, 0, s6
+; GFX1164DAGISEL-NEXT: s_ashr_i32 s7, s6, 31
+; GFX1164DAGISEL-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX1164DAGISEL-NEXT: s_mul_i32 s5, s5, s6
+; GFX1164DAGISEL-NEXT: s_mul_i32 s7, s4, s7
+; GFX1164DAGISEL-NEXT: s_add_u32 s5, s8, s5
+; GFX1164DAGISEL-NEXT: s_mul_i32 s4, s4, s6
+; GFX1164DAGISEL-NEXT: s_add_u32 s5, s5, s7
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX1164DAGISEL-NEXT: ; %bb.4: ; %endif
+; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: divergent_cfg_i64:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164GISEL-NEXT: s_mov_b64 s[8:9], exec
+; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1164GISEL-NEXT: ; %bb.1: ; %else
+; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164GISEL-NEXT: s_sub_i32 s6, 0, s6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_ashr_i32 s7, s6, 31
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_hi_u32 s10, s2, s6
+; GFX1164GISEL-NEXT: s_mul_i32 s3, s3, s6
+; GFX1164GISEL-NEXT: s_mul_i32 s7, s2, s7
+; GFX1164GISEL-NEXT: s_add_u32 s3, s10, s3
+; GFX1164GISEL-NEXT: s_mul_i32 s6, s2, s6
+; GFX1164GISEL-NEXT: s_add_u32 s7, s3, s7
+; GFX1164GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9]
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX1164GISEL-NEXT: ; %bb.3: ; %if
+; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164GISEL-NEXT: s_sub_i32 s6, 0, s6
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT: s_ashr_i32 s7, s6, 31
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX1164GISEL-NEXT: s_mul_i32 s5, s5, s6
+; GFX1164GISEL-NEXT: s_mul_i32 s7, s4, s7
+; GFX1164GISEL-NEXT: s_add_u32 s5, s8, s5
+; GFX1164GISEL-NEXT: s_mul_i32 s6, s4, s6
+; GFX1164GISEL-NEXT: s_add_u32 s7, s5, s7
+; GFX1164GISEL-NEXT: .LBB9_4: ; %endif
+; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: divergent_cfg_i64:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_clause 0x1
+; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1132DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
+; GFX1132DAGISEL-NEXT: s_mov_b32 s6, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s6, s6
+; GFX1132DAGISEL-NEXT: s_sub_i32 s6, 0, s6
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_ashr_i32 s7, s6, 31
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_mul_hi_u32 s9, s2, s6
+; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s3, s6
+; GFX1132DAGISEL-NEXT: s_mul_i32 s7, s2, s7
+; GFX1132DAGISEL-NEXT: s_add_u32 s3, s9, s3
+; GFX1132DAGISEL-NEXT: s_mul_i32 s6, s2, s6
+; GFX1132DAGISEL-NEXT: s_add_u32 s7, s3, s7
+; GFX1132DAGISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
+; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_sub_i32 s3, 0, s3
+; GFX1132DAGISEL-NEXT: s_ashr_i32 s6, s3, 31
+; GFX1132DAGISEL-NEXT: s_mul_hi_u32 s7, s4, s3
+; GFX1132DAGISEL-NEXT: s_mul_i32 s5, s5, s3
+; GFX1132DAGISEL-NEXT: s_mul_i32 s6, s4, s6
+; GFX1132DAGISEL-NEXT: s_add_u32 s5, s7, s5
+; GFX1132DAGISEL-NEXT: s_mul_i32 s4, s4, s3
+; GFX1132DAGISEL-NEXT: s_add_u32 s5, s5, s6
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1132DAGISEL-NEXT: ; %bb.4: ; %endif
+; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: divergent_cfg_i64:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1132GISEL-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
+; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_2
+; GFX1132GISEL-NEXT: ; %bb.1: ; %else
+; GFX1132GISEL-NEXT: s_mov_b32 s6, exec_lo
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s6, s6
+; GFX1132GISEL-NEXT: s_sub_i32 s6, 0, s6
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_ashr_i32 s7, s6, 31
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_hi_u32 s9, s2, s6
+; GFX1132GISEL-NEXT: s_mul_i32 s3, s3, s6
+; GFX1132GISEL-NEXT: s_mul_i32 s7, s2, s7
+; GFX1132GISEL-NEXT: s_add_u32 s3, s9, s3
+; GFX1132GISEL-NEXT: s_mul_i32 s6, s2, s6
+; GFX1132GISEL-NEXT: s_add_u32 s7, s3, s7
+; GFX1132GISEL-NEXT: .LBB9_2: ; %Flow
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s2, s8
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_4
+; GFX1132GISEL-NEXT: ; %bb.3: ; %if
+; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-NEXT: s_sub_i32 s3, 0, s3
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT: s_ashr_i32 s6, s3, 31
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: s_mul_hi_u32 s7, s4, s3
+; GFX1132GISEL-NEXT: s_mul_i32 s5, s5, s3
+; GFX1132GISEL-NEXT: s_mul_i32 s8, s4, s6
+; GFX1132GISEL-NEXT: s_add_u32 s5, s7, s5
+; GFX1132GISEL-NEXT: s_mul_i32 s6, s4, s3
+; GFX1132GISEL-NEXT: s_add_u32 s7, s5, s8
+; GFX1132GISEL-NEXT: .LBB9_4: ; %endif
+; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %else
+
+if:
+ %reducedValTid = call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 %in2, i32 1)
+ br label %endif
+
+else:
+ %reducedValIn = call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 %in, i32 1)
+ br label %endif
+
+endif:
+ %combine = phi i64 [%reducedValTid, %if], [%reducedValIn, %else]
+ store i64 %combine, ptr addrspace(1) %out
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10DAGISEL: {{.*}}
+; GFX10GISEL: {{.*}}
+; GFX11DAGISEL: {{.*}}
+; GFX11GISEL: {{.*}}
More information about the llvm-branch-commits
mailing list