[llvm-branch-commits] [llvm] [AMDGPU] DPP wave reduction for long types - 1 (PR #189224)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sun Mar 29 04:10:32 PDT 2026
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Aaditya (easyonaadit)
Changes:
Supported ops for 64-bit types in this patch: `min`, `max`, `umin`, `umax`
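To make the new surface concrete, here is a minimal IR sketch of the kind of call this lowers (an illustration, not code from the patch: the 64-bit overload is assumed to mirror the existing 32-bit `llvm.amdgcn.wave.reduce.*` intrinsics, and the strategy immarg value used to request the DPP lowering is an assumption):

```llvm
; Hypothetical usage sketch, analogous to the truncated
; divergent_value_dpp_i64 test below. Assumes the 64-bit overload keeps
; the 32-bit signature and that this immarg value selects the DPP
; strategy (assumption).
declare i64 @llvm.amdgcn.wave.reduce.umax.i64(i64, i32 immarg)

define void @wave_reduce_umax_i64(ptr addrspace(1) %out, i64 %in) {
entry:
  %reduced = call i64 @llvm.amdgcn.wave.reduce.umax.i64(i64 %in, i32 2)
  store i64 %reduced, ptr addrspace(1) %out
  ret void
}
```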
---
Patch is 406.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/189224.diff
5 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+188-46)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll (+1084-108)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll (+1084-108)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll (+1044-108)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll (+1044-108)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 465bf3981cd5a..dfec8aaf56767 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5673,7 +5673,7 @@ static uint64_t getIdentityValueForWaveReduction(unsigned Opc) {
}
}
-static std::tuple<bool, bool> ClassifyWaveReductionOp(unsigned Opc) {
+static std::tuple<bool, bool, bool> ClassifyWaveReductionOp(unsigned Opc) {
bool is32BitOpc = Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
@@ -5688,7 +5688,8 @@ static std::tuple<bool, bool> ClassifyWaveReductionOp(unsigned Opc) {
Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
- return {is32BitOpc, isFPOp};
+ bool NeedsMovDPP = !is32BitOpc;
+ return {is32BitOpc, isFPOp, NeedsMovDPP};
}
static std::tuple<unsigned, unsigned>
@@ -5731,12 +5732,17 @@ getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST) {
case AMDGPU::V_MAX_F32_e64:
DPPOpc = AMDGPU::V_MAX_F32_dpp;
break;
+ case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
+ case AMDGPU::V_CMP_LT_I64_e64: // min.i64
+ case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
+ case AMDGPU::V_CMP_GT_I64_e64: // max.i64
+ DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;
+ break;
default:
llvm_unreachable("unhandled lane op");
}
- bool isFPOp = std::get<1>(ClassifyWaveReductionOp(Opc));
unsigned ClampOpc = Opc;
- if (!isFPOp) {
+ if (!ST.getInstrInfo()->isVALU(Opc)) {
if (Opc == AMDGPU::S_SUB_I32)
ClampOpc = AMDGPU::S_ADD_I32;
ClampOpc = ST.getInstrInfo()->getVALUOp(ClampOpc);
@@ -6013,7 +6019,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
} else {
MachineBasicBlock::iterator I = BB.end();
Register SrcReg = MI.getOperand(1).getReg();
- auto [is32BitOpc, isFPOp] = ClassifyWaveReductionOp(Opc);
+ auto [is32BitOpc, isFPOp, NeedsMovDPP] = ClassifyWaveReductionOp(Opc);
// Create virtual registers required for lowering.
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
@@ -6246,7 +6252,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
RetBB = ComputeEnd;
} else {
assert(ST.hasDPP() && "Sub Target does not support DPP Operations");
-
+ MachineBasicBlock *CurrBB = &BB;
Register SrcWithIdentity = MRI.createVirtualRegister(SrcRegClass);
Register IdentityVGPR = MRI.createVirtualRegister(SrcRegClass);
Register IdentitySGPR = MRI.createVirtualRegister(DstRegClass);
@@ -6260,13 +6266,19 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
Register RowBcast31 = MRI.createVirtualRegister(SrcRegClass);
Register UndefExec = MRI.createVirtualRegister(WaveMaskRegClass);
Register FinalDPPResult;
- BuildMI(BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), UndefExec);
+ MachineInstr *SrcWithIdentityInstr;
+ MachineInstr *LastBcastInstr;
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), UndefExec);
uint64_t IdentityValue = getIdentityValueForWaveReduction(Opc);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), IdentitySGPR)
+ BuildMI(*CurrBB, MI, DL,
+ TII->get(is32BitOpc ? AMDGPU::S_MOV_B32
+ : AMDGPU::S_MOV_B64_IMM_PSEUDO),
+ IdentitySGPR)
.addImm(IdentityValue);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::COPY), IdentityVGPR)
- .addReg(IdentitySGPR);
+ auto IdentityCopyInstr =
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::COPY), IdentityVGPR)
+ .addReg(IdentitySGPR);
auto [DPPOpc, ClampOpc] = getDPPOpcForWaveReduction(Opc, ST);
auto BuildSetInactiveInstr = [&](Register Dst, Register Src0,
Register Src1) {
@@ -6281,21 +6293,22 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
auto BuildDPPMachineInstr = [&](Register Dst, Register Src,
unsigned DPPCtrl) {
auto DPPInstr =
- BuildMI(BB, MI, DL, TII->get(DPPOpc), Dst).addReg(Src); // old
+ BuildMI(*CurrBB, MI, DL, TII->get(DPPOpc), Dst).addReg(Src); // old
if (isFPOp)
DPPInstr.addImm(SISrcMods::NONE); // src0 modifier
DPPInstr.addReg(Src); // src0
if (isFPOp)
DPPInstr.addImm(SISrcMods::NONE); // src1 modifier
+ if (!NeedsMovDPP)
+ DPPInstr.addReg(Src); // src1
DPPInstr
- .addReg(Src) // src1
.addImm(DPPCtrl) // dpp-ctrl
.addImm(0xf) // row-mask
.addImm(0xf) // bank-mask
.addImm(0); // bound-control
};
auto BuildClampInstr = [&](Register Dst, Register Src0, Register Src1) {
- auto ClampInstr = BuildMI(BB, MI, DL, TII->get(ClampOpc), Dst);
+ auto ClampInstr = BuildMI(*CurrBB, MI, DL, TII->get(ClampOpc), Dst);
if (isFPOp)
ClampInstr.addImm(SISrcMods::NONE); // src0 mod
ClampInstr.addReg(Src0); // src0
@@ -6306,95 +6319,201 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
ClampInstr.addImm(0); // clamp
if (isFPOp)
ClampInstr.addImm(0); // omod
+ LastBcastInstr = ClampInstr;
+ };
+ auto BuildPostDPPInstr = [&](Register Src0, Register Src1) {
+ Register CmpMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register MinMaxResultReg = MRI.createVirtualRegister(SrcRegClass);
+ BuildMI(*CurrBB, MI, DL, TII->get(Opc), CmpMaskReg)
+ .addReg(Src0) // src0
+ .addReg(Src1); // src1
+ LastBcastInstr =
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B64_PSEUDO),
+ MinMaxResultReg)
+ .addReg(Src1) // src0
+ .addReg(Src0) // src1
+ .addReg(CmpMaskReg); // src2
+ CurrBB = Expand64BitV_CND_MASK(*LastBcastInstr, CurrBB);
+ return MinMaxResultReg;
};
// Set inactive lanes to the identity value.
- MachineInstr *SrcWithIdentityInstr =
- BuildSetInactiveInstr(SrcWithIdentity, SrcReg, IdentityVGPR);
-
+ if (is32BitOpc) {
+ SrcWithIdentityInstr =
+ BuildSetInactiveInstr(SrcWithIdentity, SrcReg, IdentityVGPR);
+ } else {
+ Register SrcWithIdentitylo =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register SrcWithIdentityhi =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ auto [Reg0Sub0, Reg0Sub1] =
+ ExtractSubRegs(MI, IdentityCopyInstr->getOperand(0), SrcRegClass);
+ auto [SrcReg0Sub0, SrcReg0Sub1] =
+ ExtractSubRegs(MI, MI.getOperand(1), SrcRegClass);
+ MachineInstr *SetInactiveLoInstr = BuildSetInactiveInstr(
+ SrcWithIdentitylo, SrcReg0Sub0.getReg(), Reg0Sub0.getReg());
+ MachineInstr *SetInactiveHiInstr = BuildSetInactiveInstr(
+ SrcWithIdentityhi, SrcReg0Sub1.getReg(), Reg0Sub1.getReg());
+ SrcWithIdentityInstr =
+ BuildRegSequence(*CurrBB, MI, SrcWithIdentity,
+ SetInactiveLoInstr->getOperand(0).getReg(),
+ SetInactiveHiInstr->getOperand(0).getReg());
+ }
// DPP reduction
- BuildDPPMachineInstr(DPPRowShr1, SrcWithIdentity,
+ Register SrcWithIdentityReg =
+ SrcWithIdentityInstr->getOperand(0).getReg();
+ BuildDPPMachineInstr(DPPRowShr1, SrcWithIdentityReg,
AMDGPU::DPP::ROW_SHR_FIRST);
+ if (NeedsMovDPP)
+ DPPRowShr1 = BuildPostDPPInstr(SrcWithIdentityReg, DPPRowShr1);
BuildDPPMachineInstr(DPPRowShr2, DPPRowShr1,
(AMDGPU::DPP::ROW_SHR_FIRST + 1));
+ if (NeedsMovDPP)
+ DPPRowShr2 = BuildPostDPPInstr(DPPRowShr1, DPPRowShr2);
BuildDPPMachineInstr(DPPRowShr4, DPPRowShr2,
(AMDGPU::DPP::ROW_SHR_FIRST + 3));
+ if (NeedsMovDPP)
+ DPPRowShr4 = BuildPostDPPInstr(DPPRowShr2, DPPRowShr4);
BuildDPPMachineInstr(DPPRowShr8, DPPRowShr4,
(AMDGPU::DPP::ROW_SHR_FIRST + 7));
+ if (NeedsMovDPP)
+ DPPRowShr8 = BuildPostDPPInstr(DPPRowShr4, DPPRowShr8);
if (ST.hasDPPBroadcasts()) {
BuildDPPMachineInstr(RowBcast15, DPPRowShr8, AMDGPU::DPP::BCAST15);
+ if (NeedsMovDPP)
+ RowBcast15 = BuildPostDPPInstr(DPPRowShr8, RowBcast15);
} else {
// magic constant: 0x1E0
// To Set BIT_MODE : bit 15 = 0
// XOR mask : bit [14:10] = 0
// OR mask : bit [9:5] = 15
// AND mask : bit [4:0] = 0
- Register SwizzledValue =
- MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32), SwizzledValue)
- .addReg(DPPRowShr8) // addr
- .addImm(0x1E0) // swizzle offset (i16)
- .addImm(0x0); // gds (i1)
- BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue);
+ if (is32BitOpc) {
+ Register SwizzledValue =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
+ SwizzledValue)
+ .addReg(DPPRowShr8) // addr
+ .addImm(0x1E0) // swizzle offset (i16)
+ .addImm(0x0); // gds (i1)
+ BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue);
+ } else {
+ Register SwizzledValuelo =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register SwizzledValuehi =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register SwizzledValue64 = MRI.createVirtualRegister(SrcRegClass);
+ MachineOperand DPPRowShr8Op =
+ MachineOperand::CreateReg(DPPRowShr8, /*isDef=*/false);
+ auto [Op1L, Op1H] = ExtractSubRegs(MI, DPPRowShr8Op, SrcRegClass);
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
+ SwizzledValuelo)
+ .add(Op1L) // addr
+ .addImm(0x1E0) // swizzle offset (i16)
+ .addImm(0x0); // gds (i1)
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
+ SwizzledValuehi)
+ .add(Op1H) // addr
+ .addImm(0x1E0) // swizzle offset (i16)
+ .addImm(0x0); // gds (i1)
+ BuildRegSequence(*CurrBB, MI, SwizzledValue64, SwizzledValuelo,
+ SwizzledValuehi);
+ if (NeedsMovDPP)
+ RowBcast15 = BuildPostDPPInstr(DPPRowShr8, SwizzledValue64);
+ else
+ BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue64);
+ }
}
FinalDPPResult = RowBcast15;
if (!IsWave32) {
if (ST.hasDPPBroadcasts()) {
BuildDPPMachineInstr(RowBcast31, RowBcast15, AMDGPU::DPP::BCAST31);
+ if (NeedsMovDPP)
+ RowBcast31 = BuildPostDPPInstr(RowBcast15, RowBcast31);
} else {
Register ShiftedThreadID =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register PermuteByteOffset =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- Register PermutedValue =
- MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- Register Lane32Offset = MRI.createVirtualRegister(DstRegClass);
- Register WordSizeConst = MRI.createVirtualRegister(DstRegClass);
+ Register PermutedValue = MRI.createVirtualRegister(SrcRegClass);
+ Register Lane32Offset =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register WordSizeConst =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register ThreadIDRegLo =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register ThreadIDReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
// Get the thread ID.
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
ThreadIDRegLo)
.addImm(-1)
.addImm(0);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
ThreadIDReg)
.addImm(-1)
.addReg(ThreadIDRegLo);
// shift each lane over by 32 positions, so value in 31st lane is
// present in 63rd lane.
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), Lane32Offset)
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), Lane32Offset)
.addImm(0x20);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), ShiftedThreadID)
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64),
+ ShiftedThreadID)
.addReg(ThreadIDReg)
.addReg(Lane32Offset)
.addImm(0); // clamp
// multiply by reg size.
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), WordSizeConst)
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), WordSizeConst)
.addImm(0x4);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64),
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64),
PermuteByteOffset)
.addReg(WordSizeConst)
.addReg(ShiftedThreadID);
// Permute the lanes
- BuildMI(BB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32), PermutedValue)
- .addReg(PermuteByteOffset) // addr
- .addReg(RowBcast15) // data
- .addImm(0); // offset
- BuildClampInstr(RowBcast31, RowBcast15, PermutedValue);
+ if (is32BitOpc) {
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
+ PermutedValue)
+ .addReg(PermuteByteOffset) // addr
+ .addReg(RowBcast15) // data
+ .addImm(0); // offset
+ } else {
+ Register PermutedValuelo =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register PermutedValuehi =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineOperand RowBcast15Op =
+ MachineOperand::CreateReg(RowBcast15, /*isDef=*/false);
+ auto [RowBcast15Lo, RowBcast15Hi] =
+ ExtractSubRegs(MI, RowBcast15Op, SrcRegClass);
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
+ PermutedValuelo)
+ .addReg(PermuteByteOffset) // addr
+ .add(RowBcast15Lo) // data
+ .addImm(0x0); // offset
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
+ PermutedValuehi)
+ .addReg(PermuteByteOffset) // addr
+ .add(RowBcast15Hi) // data
+ .addImm(0x0); // offset
+ BuildRegSequence(*CurrBB, MI, PermutedValue, PermutedValuelo,
+ PermutedValuehi);
+ }
+ if (NeedsMovDPP)
+ RowBcast31 = BuildPostDPPInstr(RowBcast15, PermutedValue);
+ else
+ BuildClampInstr(RowBcast31, RowBcast15, PermutedValue);
}
FinalDPPResult = RowBcast31;
}
if (Opc == AMDGPU::V_SUB_F32_e64) {
Register NegatedValVGPR =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_SUB_F32_e64), NegatedValVGPR)
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_SUB_F32_e64),
+ NegatedValVGPR)
.addImm(SISrcMods::NONE) // src0 mods
.addReg(IdentityVGPR) // src0
.addImm(SISrcMods::NONE) // src1 mods
@@ -6404,18 +6523,41 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
FinalDPPResult = NegatedValVGPR;
}
// The final reduced value is in the last lane.
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), ReducedValSGPR)
- .addReg(FinalDPPResult)
- .addImm(ST.getWavefrontSize() - 1);
+ if (is32BitOpc) {
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
+ ReducedValSGPR)
+ .addReg(FinalDPPResult)
+ .addImm(ST.getWavefrontSize() - 1);
+ } else {
+ Register LaneValueLoReg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register LaneValueHiReg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+ MachineOperand FinalDPPResultOperand =
+ MachineOperand::CreateReg(FinalDPPResult, /*isDef=*/false);
+ auto [Op1L, Op1H] = ExtractSubRegs(MI, FinalDPPResultOperand, SrcRC);
+ // V_READLANE_B32 reads each 32-bit half of the lane value into an SGPR.
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
+ LaneValueLoReg)
+ .add(Op1L)
+ .addImm(ST.getWavefrontSize() - 1);
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
+ LaneValueHiReg)
+ .add(Op1H)
+ .addImm(ST.getWavefrontSize() - 1);
+ BuildRegSequence(*CurrBB, MI, ReducedValSGPR, LaneValueLoReg,
+ LaneValueHiReg);
+ }
if (Opc == AMDGPU::S_SUB_I32)
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
.addImm(0)
.addReg(ReducedValSGPR);
// Mark the final result as a whole-wave-mode calculation.
- BuildMI(BB, MI, DL, TII->get(AMDGPU::STRICT_WWM), DstReg)
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::STRICT_WWM), DstReg)
.addReg(Opc == AMDGPU::S_SUB_I32 ? NegatedReducedVal
: ReducedValSGPR);
- RetBB = &BB;
+ RetBB = CurrBB;
}
}
MI.eraseFromParent();
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
index 502d58f66bd31..8f49864792794 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll
@@ -816,6 +816,982 @@ entry:
ret void
}
+define void @divergent_value_dpp_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0...
[truncated]
``````````
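A note for readers skimming the diff above: since there is no 64-bit min/max DPP instruction, the 64-bit path seeds inactive lanes with the identity value one 32-bit half at a time, and each reduction step becomes a `V_MOV_B64_DPP_PSEUDO` shuffle followed by a `V_CMP_{LT,GT}_{I,U}64_e64` compare plus a `V_CNDMASK_B64_PSEUDO` select (the `BuildPostDPPInstr` helper). On subtargets without DPP broadcasts, the `DS_SWIZZLE_B32` and `DS_PERMUTE_B32` fallbacks likewise run once per half, with the halves rejoined by a register sequence. As a quick worked example of the permute arithmetic: `ds_permute_b32` forward-permutes (lane `i` writes its data to lane `addr/4`), and with `addr = 4 * (tid + 32)` lane 31 computes `4 * 63 = 252`, so its running result lands in lane 63, where two `V_READLANE_B32`s read back the low and high halves.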
https://github.com/llvm/llvm-project/pull/189224