[llvm] Test branch wave reduce (PR #111366)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 7 22:28:36 PDT 2024
https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/111366
>From 30454d1ceb4e126d08ed01d19fbcd7cc513ecec6 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 24 Sep 2024 15:35:42 +0530
Subject: [PATCH 01/13] Added wave reduce intrinsics for integer
 add, sub, and, or, xor. Still have to extend this to unsigned sub and floats.
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 7 +
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 9 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 236 +++++++++++++++++-
llvm/lib/Target/AMDGPU/SIInstructions.td | 35 +++
.../global_atomics_iterative_scan_fp.ll | 2 +-
5 files changed, 281 insertions(+), 8 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4cd32a0502c66d..097a074859ca10 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2121,6 +2121,13 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_add : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_uadd : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_sub : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_usub : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_and : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_or : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_xor : AMDGPUWaveReduce;
def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index f2c9619cb8276a..c5ee2944e3015e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4851,7 +4851,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case Intrinsic::amdgcn_wave_reduce_umin:
- case Intrinsic::amdgcn_wave_reduce_umax: {
+ case Intrinsic::amdgcn_wave_reduce_umax:
+ case Intrinsic::amdgcn_wave_reduce_and:
+ case Intrinsic::amdgcn_wave_reduce_or:
+ case Intrinsic::amdgcn_wave_reduce_xor:
+ case Intrinsic::amdgcn_wave_reduce_usub:
+ case Intrinsic::amdgcn_wave_reduce_sub:
+ case Intrinsic::amdgcn_wave_reduce_uadd:
+ case Intrinsic::amdgcn_wave_reduce_add: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 10108866a7005a..f787f3d71fc045 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4859,10 +4859,220 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
Register DstReg = MI.getOperand(0).getReg();
MachineBasicBlock *RetBB = nullptr;
if (isSGPR) {
- // These operations with a uniform value i.e. SGPR are idempotent.
- // Reduced value will be same as given sgpr.
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
- RetBB = &BB;
+ switch(Opc){
+ case AMDGPU::S_MIN_U32:
+ case AMDGPU::S_MAX_U32:
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_OR_B32:
+ // These operations with a uniform value i.e. SGPR are idempotent.
+ // Reduced value will be same as given sgpr.
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+ RetBB = &BB;
+ break;
+ // TODO --> add support for Unsigned ADD and unsigned SUB.
+ case AMDGPU::S_XOR_B32:
+ case AMDGPU::S_ADD_U32:
+ case AMDGPU::S_ADD_I32:
+ // case AMDGPU::S_SUB_U32:
+ case AMDGPU::S_SUB_I32:{
+ MachineBasicBlock::iterator I = BB.end();
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ // Create Control flow for loop
+ // Split MI's Machine Basic block into For loop
+ auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
+
+ // Create virtual registers required for lowering.
+ const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+ const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+ Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
+
+ Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
+ Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+
+ Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
+ Register CountOfActiveLanesReg = MRI.createVirtualRegister(DstRegClass);
+
+ bool IsWave32 = ST.isWave32();
+ unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+    // Create initial values of induction variable from Exec, Accumulator and
+    // insert branch instr to newly created ComputeBlock
+ uint32_t InitalValue = 0;
+
+ auto TmpSReg =
+ BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
+ .addImm(InitalValue);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
+
+ // Start constructing ComputeLoop
+ I = ComputeLoop->end();
+ auto Accumulator =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
+ .addReg(InitalValReg)
+ .addMBB(&BB);
+ auto ActiveBits =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
+ .addReg(TmpSReg->getOperand(0).getReg())
+ .addMBB(&BB);
+
+ // Perform the computations
+ unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
+ auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+ .addReg(ActiveBits->getOperand(0).getReg());
+ auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), CountOfActiveLanesReg)
+ .addReg(Accumulator->getOperand(0).getReg())
+ .addImm(1);
+
+ // Manipulate the iterator to get the next active lane
+ unsigned BITSETOpc =
+ IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
+ auto NewActiveBits =
+ BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+ .addReg(FF1->getOperand(0).getReg())
+ .addReg(ActiveBits->getOperand(0).getReg());
+
+ // Add phi nodes
+ Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
+ .addMBB(ComputeLoop);
+ ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
+ .addMBB(ComputeLoop);
+
+ // Creating branching
+ unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
+ BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
+ .addReg(NewActiveBits->getOperand(0).getReg())
+ .addImm(0);
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+ .addMBB(ComputeLoop);
+
+ I = ComputeEnd->begin();
+ switch(Opc){
+ case AMDGPU::S_XOR_B32:{
+ // Performing an XOR operation on a uniform value
+ // depends on the number of active lanes. If there
+ // are an even number of active lanes, then the XOR
+ // will result in 0. And if there are an odd number
+ // of Active lanes then the XOR will result in the
+ // same value as that in the SGPR. This comes from
+ // the fact that A^A = 0 and A^0 = A.
+
+ // Create basic block to check the parity.
+ // MachineFunction &MF = *ComputeEnd->getParent();
+ // MachineBasicBlock *CheckParity = MF.CreateMachineBasicBlock();
+ // MachineFunction::iterator It = ComputeEnd->getIterator();
+ // MF.insert(It, CheckParity);
+ // ComputeLoop->addSuccessor(CheckParity);
+ // ComputeLoop->removeSuccessor(ComputeEnd);
+
+ Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+ // Register Product = MRI.createVirtualRegister(DstRegClass);
+ // Register OddResult = MRI.createVirtualRegister(DstRegClass);
+ // MachineBasicBlock *Even = MF.CreateMachineBasicBlock();
+ // MachineBasicBlock *Odd = MF.CreateMachineBasicBlock();
+ // MF.push_back(Even);
+ // MF.push_back(Odd);
+ // CheckParity->addSuccessor(Even);
+ // CheckParity->addSuccessor(Odd);
+ // Even->addSuccessor(ComputeEnd);
+ // Odd->addSuccessor(ComputeEnd);
+
+ // If the LSB is set, the number is odd, else it is even.
+ // TODO --> is FF0 faster or left-shift by 31 faster or AND 0xfffffffe??
+ // I = CheckParity->begin();
+ auto ParityReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+ .addReg(NewAccumulator->getOperand(0).getReg())
+ .addImm(1);
+
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ .addReg(ParityReg->getOperand(0).getReg())
+ .addImm(SrcReg);
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+ // .addMBB(Even);
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_BRANCH))
+ // .addMBB(Odd);
+
+ // If there are an even number of active lanes, the result is 0.
+ // I = Even->begin();
+ // BuildMI(*Even, I, DL, TII->get(AMDGPU::S_MOV_B32), EvenResult).addImm(0);
+ // BuildMI(*Even, I, DL, TII->get(AMDGPU::S_BRANCH))
+ // .addMBB(ComputeEnd);
+
+ // If there are an odd number of active lanes, the result is the value itself.
+ // I = Odd->begin();
+ // BuildMI(*Odd, I, DL, TII->get(AMDGPU::S_MOV_B32), OddResult).addReg(SrcReg);
+ // BuildMI(*Odd, I, DL, TII->get(AMDGPU::S_BRANCH))
+ // .addMBB(ComputeEnd);
+
+ // Add PHI node to get the appropriate result.
+ // I = ComputeEnd->begin();
+ // auto PhiNode =
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::PHI), DstReg)
+ // .addReg(EvenResult)
+ // .addMBB(Even);
+ // PhiNode.addReg(OddResult)
+ // .addMBB(Odd);
+ break;
+ }
+ case AMDGPU::S_SUB_U32:{
+ // Doubt --> how can you have a negative unsigned value??
+ break;
+ }
+ case AMDGPU::S_SUB_I32:{
+ // TODO --> use 2's compliment or subtract from 0 to find the negation of the number.
+ Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+ // Take the negation of the source operand.
+ auto InvertedValReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
+ // Multiply the negated value with the number of active lanes.
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg).addReg(InvertedValReg->getOperand(0).getReg()).addReg(NewAccumulator->getOperand(0).getReg());
+ break;
+ }
+ // Doubt --> is SSA form still have to be followed for MIR?
+ case AMDGPU::S_ADD_U32:{
+ // For unsigned multiplication, zero extend the inputs to 64bits,
+ // perform an unsigned multiplication on them and then store the
+ // 32 lower order bits as the result.
+ Register ExtendedInput = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register ZeroExtension = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register ExtendedCount = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register UnsignedProduct = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ auto ZeroExtented =
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MOV_B32), ZeroExtension)
+ .addImm(0);
+
+ // Zero extend the input to 64bits.
+ auto Input_64 =
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::REG_SEQUENCE), ExtendedInput)
+ .addReg(SrcReg).addImm(AMDGPU::sub0)
+ .addReg(ZeroExtented->getOperand(0).getReg()).addImm(AMDGPU::sub1);
+
+ // Zero extend the number of active lanes to 64bits.
+ auto Count_64 =
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::REG_SEQUENCE), ExtendedCount)
+ .addReg(NewAccumulator->getOperand(0).getReg()).addImm(AMDGPU::sub0)
+ .addReg(ZeroExtented->getOperand(0).getReg()).addImm(AMDGPU::sub1);
+
+ auto Product =
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_U64), UnsignedProduct)
+ .addReg(Input_64->getOperand(0).getReg())
+ .addReg(Count_64->getOperand(0).getReg());
+
+ // Store the lower 32bits of the product as the result.
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(Product->getOperand(0).getReg(), 0, AMDGPU::sub0);
+ break;
+ }
+ case AMDGPU::S_ADD_I32:
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg).addReg(SrcReg).addReg(NewAccumulator->getOperand(0).getReg());
+ }
+
+ RetBB = ComputeEnd;
+ }
+ }
} else {
// TODO: Implement DPP Strategy and switch based on immediate strategy
// operand. For now, for all the cases (default, Iterative and DPP we use
@@ -4898,7 +5108,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
// Create initail values of induction variable from Exec, Accumulator and
- // insert branch instr to newly created ComputeBlockk
+ // insert branch instr to newly created ComputeBlock
uint32_t InitalValue =
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
auto TmpSReg =
@@ -4970,6 +5180,20 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+ case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U32);
+ case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+ case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U32);
+ case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+ case AMDGPU::WAVE_REDUCE_AND_PSEUDO_U32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+ case AMDGPU::WAVE_REDUCE_OR_PSEUDO_U32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+ case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_U32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
const DebugLoc &DL = MI.getDebugLoc();
@@ -6771,7 +6995,7 @@ SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
- // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64
+ // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64 // TODO --> `..are zero-extended to 32-bits, then we ..` , should this be zero-extended from 32 bits?
// with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
// 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
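For reference, here is a minimal host-side C++ sketch (illustrative, standalone names; not code from this patch) of what the new SGPR path computes: ComputeLoop walks a saved copy of EXEC with find-first-set and clear-bit to count the active lanes, and ComputeEnd then folds the wave-uniform input with that count.

#include <bit>
#include <cstdint>

enum class WaveOp { Add, Sub, Xor, Other /* min/max/and/or */ };

// Modeled on ComputeLoop: count active lanes by repeatedly taking the
// lowest set bit of the saved EXEC mask and clearing it.
static unsigned countActiveLanes(uint64_t ExecMask) {
  unsigned Count = 0;
  while (ExecMask != 0) {                        // S_CMP_LG_U64 + S_CBRANCH_SCC1
    unsigned Lane = std::countr_zero(ExecMask);  // S_FF1_I32_B64
    ExecMask &= ~(uint64_t(1) << Lane);          // S_BITSET0_B64
    ++Count;                                     // accumulator combined with 1
  }
  return Count;
}

// Modeled on ComputeEnd: closed forms for a wave-uniform input value.
static int32_t reduceUniform(WaveOp Op, int32_t Val, unsigned NumLanes) {
  switch (Op) {
  case WaveOp::Add: return int32_t(uint32_t(Val) * NumLanes);          // S_MUL_I32 (wraps like HW)
  case WaveOp::Sub: return int32_t((0u - uint32_t(Val)) * NumLanes);   // negate, then scale by count
  case WaveOp::Xor: return Val * int32_t(NumLanes & 1);                // parity: A^A = 0, A^0 = A
  default:          return Val; // min/max/and/or are idempotent for a uniform value
  }
}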
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9afb29d95abd7d..b61094cd5f6309 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -264,6 +264,41 @@ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
}
+
+ def WAVE_REDUCE_ADD_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_add i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_ADD_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_uadd i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_SUB_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_sub i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_SUB_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_usub i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_AND_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_OR_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_XOR_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_xor i32:$src, i32:$strategy))]> {
+ }
}
let usesCustomInserter = 1, Defs = [VCC] in {
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
index d1e50bd560cb23..02942254cc555b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
@@ -156,7 +156,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_value(ptr addrspace(1) %ptr) #
; IR-DPP: 14:
; IR-DPP-NEXT: ret void
;
- %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 seq_cst
ret void
}
>From 2ea5ae516d888e1c3c302f25a8297f6e49c46fd4 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Thu, 26 Sep 2024 13:33:57 +0530
Subject: [PATCH 02/13] S_MUL fiasco
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 15 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 180 +++++++++++-----------
2 files changed, 104 insertions(+), 91 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 097a074859ca10..c80168c01bc9ad 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2109,7 +2109,7 @@ def int_amdgcn_s_quadmask :
def int_amdgcn_s_wqm :
DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;
-class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
+class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
[data_ty],
[
LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR)
@@ -2119,6 +2119,19 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
+//multiclass AMDGPUWaveReducee {
+// foreach Opcode = ["umin", "umax", "add", "uadd", "sub", "usub", "and", "or", "xor"] in
+// def int_amdgcn_wave_reduce_#Opcode : AMDGPUWaveReduce;
+//}
+
+//multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
+// foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
+// def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
+//}
+
+//WaveReduceDefs<["umin", "umax", "add", "uadd", "sub", "usub", "and", "or", "xor"]>;
+//list<string> Operations
+
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_add : AMDGPUWaveReduce;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9b4f25ba10d42b..646e7a3d415f72 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4875,7 +4875,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
case AMDGPU::S_XOR_B32:
case AMDGPU::S_ADD_U32:
case AMDGPU::S_ADD_I32:
- // case AMDGPU::S_SUB_U32:
+ case AMDGPU::S_SUB_U32:
case AMDGPU::S_SUB_I32:{
MachineBasicBlock::iterator I = BB.end();
Register SrcReg = MI.getOperand(1).getReg();
@@ -4963,115 +4963,115 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
// same value as that in the SGPR. This comes from
// the fact that A^A = 0 and A^0 = A.
- // Create basic block to check the parity.
- // MachineFunction &MF = *ComputeEnd->getParent();
- // MachineBasicBlock *CheckParity = MF.CreateMachineBasicBlock();
- // MachineFunction::iterator It = ComputeEnd->getIterator();
- // MF.insert(It, CheckParity);
- // ComputeLoop->addSuccessor(CheckParity);
- // ComputeLoop->removeSuccessor(ComputeEnd);
-
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
- // Register Product = MRI.createVirtualRegister(DstRegClass);
- // Register OddResult = MRI.createVirtualRegister(DstRegClass);
- // MachineBasicBlock *Even = MF.CreateMachineBasicBlock();
- // MachineBasicBlock *Odd = MF.CreateMachineBasicBlock();
- // MF.push_back(Even);
- // MF.push_back(Odd);
- // CheckParity->addSuccessor(Even);
- // CheckParity->addSuccessor(Odd);
- // Even->addSuccessor(ComputeEnd);
- // Odd->addSuccessor(ComputeEnd);
-
- // If the LSB is set, the number is odd, else it is even.
- // TODO --> is FF0 faster or left-shift by 31 faster or AND 0xfffffffe??
- // I = CheckParity->begin();
+
auto ParityReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
.addReg(NewAccumulator->getOperand(0).getReg())
.addImm(1);
BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(ParityReg->getOperand(0).getReg())
- .addImm(SrcReg);
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
- // .addMBB(Even);
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_BRANCH))
- // .addMBB(Odd);
-
- // If there are an even number of active lanes, the result is 0.
- // I = Even->begin();
- // BuildMI(*Even, I, DL, TII->get(AMDGPU::S_MOV_B32), EvenResult).addImm(0);
- // BuildMI(*Even, I, DL, TII->get(AMDGPU::S_BRANCH))
- // .addMBB(ComputeEnd);
-
- // If there are an odd number of active lanes, the result is the value itself.
- // I = Odd->begin();
- // BuildMI(*Odd, I, DL, TII->get(AMDGPU::S_MOV_B32), OddResult).addReg(SrcReg);
- // BuildMI(*Odd, I, DL, TII->get(AMDGPU::S_BRANCH))
- // .addMBB(ComputeEnd);
-
- // Add PHI node to get the appropriate result.
- // I = ComputeEnd->begin();
- // auto PhiNode =
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::PHI), DstReg)
- // .addReg(EvenResult)
- // .addMBB(Even);
- // PhiNode.addReg(OddResult)
- // .addMBB(Odd);
- break;
- }
- case AMDGPU::S_SUB_U32:{
- // Doubt --> how can you have a negative unsigned value??
+ .addReg(SrcReg);
break;
}
+ // case AMDGPU::S_SUB_U32:{
+ // // // Doubt --> how can you have a negative unsigned value??
+ // Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+ // Register SrcVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ // Register Product = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ // // Take the negation of the source operand.
+ // auto InvertedValReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
+
+ // auto V_SrcReg =
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), SrcVal)
+ // .addReg(InvertedValReg->getOperand(0).getReg());
+
+ // auto ProductVal =
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), Product)
+ // .addReg(V_SrcReg->getOperand(0).getReg())
+ // .addReg(NewAccumulator->getOperand(0).getReg())
+ // .addReg(AMDGPU::EXEC, RegState::Implicit);
+
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ // .addReg(ProductVal->getOperand(0).getReg());
+
+ // break;
+ // }
+ case AMDGPU::S_SUB_U32:
case AMDGPU::S_SUB_I32:{
// TODO --> use 2's compliment or subtract from 0 to find the negation of the number.
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+ // Register SrcVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ // Register Product = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
// Take the negation of the source operand.
auto InvertedValReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
- // Multiply the negated value with the number of active lanes.
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg).addReg(InvertedValReg->getOperand(0).getReg()).addReg(NewAccumulator->getOperand(0).getReg());
- break;
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ .addReg(InvertedValReg->getOperand(0).getReg())
+ .addReg(NewAccumulator->getOperand(0).getReg());
+
+ // auto V_SrcReg =
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), SrcVal)
+ // .addReg(InvertedValReg->getOperand(0).getReg());
+
+ // auto ProductVal =
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), Product)
+ // .addReg(V_SrcReg->getOperand(0).getReg())
+ // .addReg(NewAccumulator->getOperand(0).getReg())
+ // .addReg(AMDGPU::EXEC, RegState::Implicit)
+ // .setMIFlag(MachineInstr::MIFlag::NoUWrap)
+ // .setMIFlag(MachineInstr::MIFlag::NoSWrap);
+
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ // .addReg(ProductVal->getOperand(0).getReg());
+
+ // break;
}
// Doubt --> is SSA form still have to be followed for MIR?
- case AMDGPU::S_ADD_U32:{
- // For unsigned multiplication, zero extend the inputs to 64bits,
- // perform an unsigned multiplication on them and then store the
- // 32 lower order bits as the result.
- Register ExtendedInput = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- Register ZeroExtension = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- Register ExtendedCount = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- Register UnsignedProduct = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
-
- auto ZeroExtented =
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MOV_B32), ZeroExtension)
- .addImm(0);
-
- // Zero extend the input to 64bits.
- auto Input_64 =
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::REG_SEQUENCE), ExtendedInput)
- .addReg(SrcReg).addImm(AMDGPU::sub0)
- .addReg(ZeroExtented->getOperand(0).getReg()).addImm(AMDGPU::sub1);
+ case AMDGPU::S_ADD_U32:
+ case AMDGPU::S_ADD_I32:{
+ BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ .addReg(SrcReg)
+ .addReg(NewAccumulator->getOperand(0).getReg());
+ // Register SrcVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ // Register Product = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ // auto V_SrcReg =
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), SrcVal)
+ // .addReg(SrcReg);
- // Zero extend the number of active lanes to 64bits.
- auto Count_64 =
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::REG_SEQUENCE), ExtendedCount)
- .addReg(NewAccumulator->getOperand(0).getReg()).addImm(AMDGPU::sub0)
- .addReg(ZeroExtented->getOperand(0).getReg()).addImm(AMDGPU::sub1);
-
- auto Product =
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_U64), UnsignedProduct)
- .addReg(Input_64->getOperand(0).getReg())
- .addReg(Count_64->getOperand(0).getReg());
-
- // Store the lower 32bits of the product as the result.
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(Product->getOperand(0).getReg(), 0, AMDGPU::sub0);
+ // auto ProductVal =
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), Product)
+ // .addReg(V_SrcReg->getOperand(0).getReg())
+ // .addReg(NewAccumulator->getOperand(0).getReg())
+ // .addReg(AMDGPU::EXEC, RegState::Implicit)
+ // .setMIFlag(MachineInstr::MIFlag::NoUWrap)
+ // .setMIFlag(MachineInstr::MIFlag::NoSWrap);
+
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ // .addReg(ProductVal->getOperand(0).getReg());
break;
}
- case AMDGPU::S_ADD_I32:
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg).addReg(SrcReg).addReg(NewAccumulator->getOperand(0).getReg());
+ // case AMDGPU::S_ADD_U32:{
+ // Register SrcVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ // Register Product = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ // auto V_SrcReg =
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), SrcVal)
+ // .addReg(SrcReg);
+
+ // auto ProductVal =
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), Product)
+ // .addReg(V_SrcReg->getOperand(0).getReg())
+ // .addReg(NewAccumulator->getOperand(0).getReg())
+ // .addReg(AMDGPU::EXEC, RegState::Implicit);
+
+ // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ // .addReg(ProductVal->getOperand(0).getReg());
+ // break;
+ // }
}
-
RetBB = ComputeEnd;
}
}
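For reference, a tiny standalone C++ check (not part of the patch) of why the zero-extend + S_MUL_U64 sequence deleted above can be folded to a single S_MUL_I32: the low 32 bits of a 32x32-bit product do not depend on whether the operands are treated as signed or unsigned, so multiplying the input by the active-lane count with S_MUL_I32 covers both the signed and the unsigned add/sub reductions.

#include <cassert>
#include <cstdint>

int main() {
  int32_t Val = -7;      // sample uniform input
  uint32_t Lanes = 33;   // sample active-lane count
  // 64-bit product of the zero-extended operands (the old S_MUL_U64 path).
  uint64_t Wide = uint64_t(uint32_t(Val)) * uint64_t(Lanes);
  // 32-bit wrapping product (what S_MUL_I32 produces).
  uint32_t Lo = uint32_t(Val) * Lanes;
  assert(uint32_t(Wide) == Lo); // the low halves agree for any inputs
  return 0;
}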
>From c0dd1471380cb33344aa358b12bc13583d0e1cc6 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Thu, 26 Sep 2024 14:11:51 +0530
Subject: [PATCH 03/13] temp commit
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 646e7a3d415f72..5a06070dd64feb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4868,7 +4868,9 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
case AMDGPU::S_OR_B32:
// These operations with a uniform value i.e. SGPR are idempotent.
// Reduced value will be same as given sgpr.
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+ // bool IsWave32 = ST.isWave32();
+ unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ BuildMI(BB, MI, DL, TII->get(MovOpc), DstReg).addReg(SrcReg);
RetBB = &BB;
break;
// TODO --> add support for Unsigned ADD and unsigned SUB.
>From 93e8802817dee2b79258732d3ac01e0a92ba6625 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Thu, 26 Sep 2024 17:34:14 +0530
Subject: [PATCH 04/13] temp commit
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 10 ++
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 130 +++++++---------------
llvm/lib/Target/AMDGPU/SIInstructions.td | 50 +++++++++
3 files changed, 102 insertions(+), 88 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index c80168c01bc9ad..299513a95e2989 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2133,13 +2133,23 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
//list<string> Operations
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_min : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_fmin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_max : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_fmax : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_add : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_uadd : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_fadd : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_sub : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_usub : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_fsub : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_and : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_uand : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_fand : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_or : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_uor : AMDGPUWaveReduce;
+def int_amdgcn_wave_reduce_for : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_xor : AMDGPUWaveReduce;
def int_amdgcn_readfirstlane :
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5a06070dd64feb..cd057702d6072d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4863,9 +4863,13 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
if (isSGPR) {
switch(Opc){
case AMDGPU::S_MIN_U32:
+ case AMDGPU::S_MIN_I32:
+ case AMDGPU::S_MIN_F32:
case AMDGPU::S_MAX_U32:
+ case AMDGPU::S_MAX_I32:
+ case AMDGPU::S_MAX_F32:
case AMDGPU::S_AND_B32:
- case AMDGPU::S_OR_B32:
+ case AMDGPU::S_OR_B32:{
// These operations with a uniform value i.e. SGPR are idempotent.
// Reduced value will be same as given sgpr.
// bool IsWave32 = ST.isWave32();
@@ -4873,12 +4877,14 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
BuildMI(BB, MI, DL, TII->get(MovOpc), DstReg).addReg(SrcReg);
RetBB = &BB;
break;
- // TODO --> add support for Unsigned ADD and unsigned SUB.
+ }
case AMDGPU::S_XOR_B32:
case AMDGPU::S_ADD_U32:
case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_ADD_F32:
case AMDGPU::S_SUB_U32:
- case AMDGPU::S_SUB_I32:{
+ case AMDGPU::S_SUB_I32:
+ case AMDGPU::S_SUB_F32:{
MachineBasicBlock::iterator I = BB.end();
Register SrcReg = MI.getOperand(1).getReg();
@@ -4970,109 +4976,37 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
auto ParityReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
.addReg(NewAccumulator->getOperand(0).getReg())
.addImm(1);
-
+// S_MUL_I32
+ // auto MulOp =
+  // Can you have one float and one int op? I don't think you can; need to handle the float case separately.
BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
- .addReg(ParityReg->getOperand(0).getReg())
- .addReg(SrcReg);
+ .addReg(SrcReg)
+ .addReg(ParityReg->getOperand(0).getReg()) ;
break;
}
- // case AMDGPU::S_SUB_U32:{
- // // // Doubt --> how can you have a negative unsigned value??
- // Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
- // Register SrcVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- // Register Product = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- // // Take the negation of the source operand.
- // auto InvertedValReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
-
- // auto V_SrcReg =
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), SrcVal)
- // .addReg(InvertedValReg->getOperand(0).getReg());
-
- // auto ProductVal =
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), Product)
- // .addReg(V_SrcReg->getOperand(0).getReg())
- // .addReg(NewAccumulator->getOperand(0).getReg())
- // .addReg(AMDGPU::EXEC, RegState::Implicit);
-
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
- // .addReg(ProductVal->getOperand(0).getReg());
-
- // break;
- // }
case AMDGPU::S_SUB_U32:
- case AMDGPU::S_SUB_I32:{
+ case AMDGPU::S_SUB_I32:
+ case AMDGPU::S_SUB_F32:{
// TODO --> use 2's compliment or subtract from 0 to find the negation of the number.
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
- // Register SrcVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- // Register Product = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
+
// Take the negation of the source operand.
auto InvertedValReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(InvertedValReg->getOperand(0).getReg())
.addReg(NewAccumulator->getOperand(0).getReg());
-
- // auto V_SrcReg =
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), SrcVal)
- // .addReg(InvertedValReg->getOperand(0).getReg());
-
- // auto ProductVal =
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), Product)
- // .addReg(V_SrcReg->getOperand(0).getReg())
- // .addReg(NewAccumulator->getOperand(0).getReg())
- // .addReg(AMDGPU::EXEC, RegState::Implicit)
- // .setMIFlag(MachineInstr::MIFlag::NoUWrap)
- // .setMIFlag(MachineInstr::MIFlag::NoSWrap);
-
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
- // .addReg(ProductVal->getOperand(0).getReg());
-
- // break;
+ break;
}
// Doubt --> is SSA form still have to be followed for MIR?
case AMDGPU::S_ADD_U32:
- case AMDGPU::S_ADD_I32:{
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_ADD_F32:{
+ auto Opcode = Opc == AMDGPU::S_ADD_U32 || Opc == AMDGPU::S_ADD_I32 ? AMDGPU::S_MUL_I32 : AMDGPU::S_MUL_F32;
+ BuildMI(*ComputeEnd, I, DL, TII->get(Opcode), DstReg)
.addReg(SrcReg)
.addReg(NewAccumulator->getOperand(0).getReg());
- // Register SrcVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- // Register Product = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- // auto V_SrcReg =
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), SrcVal)
- // .addReg(SrcReg);
-
- // auto ProductVal =
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), Product)
- // .addReg(V_SrcReg->getOperand(0).getReg())
- // .addReg(NewAccumulator->getOperand(0).getReg())
- // .addReg(AMDGPU::EXEC, RegState::Implicit)
- // .setMIFlag(MachineInstr::MIFlag::NoUWrap)
- // .setMIFlag(MachineInstr::MIFlag::NoSWrap);
-
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
- // .addReg(ProductVal->getOperand(0).getReg());
break;
}
- // case AMDGPU::S_ADD_U32:{
- // Register SrcVal = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- // Register Product = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- // auto V_SrcReg =
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), SrcVal)
- // .addReg(SrcReg);
-
- // auto ProductVal =
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), Product)
- // .addReg(V_SrcReg->getOperand(0).getReg())
- // .addReg(NewAccumulator->getOperand(0).getReg())
- // .addReg(AMDGPU::EXEC, RegState::Implicit);
-
- // BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
- // .addReg(ProductVal->getOperand(0).getReg());
- // break;
- // }
}
RetBB = ComputeEnd;
}
@@ -5182,20 +5116,40 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
switch (MI.getOpcode()) {
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+ case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
+ case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_F32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_F32);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+ case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+ case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_F32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_F32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+ case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_F32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_F32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+ case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_F32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_F32);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+ case AMDGPU::WAVE_REDUCE_AND_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+ case AMDGPU::WAVE_REDUCE_AND_PSEUDO_F32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+ case AMDGPU::WAVE_REDUCE_OR_PSEUDO_I32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+ case AMDGPU::WAVE_REDUCE_OR_PSEUDO_F32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
case AMDGPU::S_UADDO_PSEUDO:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index b61094cd5f6309..534b4d2c052482 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -260,15 +260,40 @@ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses
[(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
}
+ def WAVE_REDUCE_UMIN_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_min i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_UMIN_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set f32:$sdst, (int_amdgcn_wave_reduce_fmin f32:$src, i32:$strategy))]> {
+ }
+
def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
}
+ def WAVE_REDUCE_UMAX_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_max i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_UMAX_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set f32:$sdst, (int_amdgcn_wave_reduce_fmax f32:$src, i32:$strategy))]> {
+ }
+
def WAVE_REDUCE_ADD_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_add i32:$src, i32:$strategy))]> {
}
+
+ def WAVE_REDUCE_ADD_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set f32:$sdst, (int_amdgcn_wave_reduce_add f32:$src, i32:$strategy))]> {
+ }
def WAVE_REDUCE_ADD_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
@@ -284,16 +309,41 @@ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_usub i32:$src, i32:$strategy))]> {
}
+
+ def WAVE_REDUCE_SUB_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set f32:$sdst, (int_amdgcn_wave_reduce_fsub f32:$src, i32:$strategy))]> {
+ }
def WAVE_REDUCE_AND_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_uand i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_AND_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> {
}
+
+ def WAVE_REDUCE_AND_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set f32:$sdst, (int_amdgcn_wave_reduce_fand f32:$src, i32:$strategy))]> {
+ }
def WAVE_REDUCE_OR_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_uor i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_OR_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> {
}
+
+ def WAVE_REDUCE_OR_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set f32:$sdst, (int_amdgcn_wave_reduce_for f32:$src, i32:$strategy))]> {
+ }
def WAVE_REDUCE_XOR_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
>From d6dc7a5e23e2c5b839fabfbb85259cb3efdd0197 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Fri, 27 Sep 2024 10:03:36 +0530
Subject: [PATCH 05/13] Changed atomic optimizer to emit wave.reduce intrinsic.
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 12 -
.../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 346 +++++++++---------
2 files changed, 177 insertions(+), 181 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 299513a95e2989..fc6d13899a809d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2119,18 +2119,6 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
-//multiclass AMDGPUWaveReducee {
-// foreach Opcode = ["umin", "umax", "add", "uadd", "sub", "usub", "and", "or", "xor"] in
-// def int_amdgcn_wave_reduce_#Opcode : AMDGPUWaveReduce;
-//}
-
-//multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
-// foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
-// def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
-//}
-
-//WaveReduceDefs<["umin", "umax", "add", "uadd", "sub", "usub", "and", "or", "xor"]>;
-//list<string> Operations
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_min : AMDGPUWaveReduce;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index f408a013d7a379..8bc3bc81002adf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -682,7 +682,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// lanes that are around only for the purposes of derivatives to take part
// in any cross-lane communication, and we use a branch on whether the lane is
// live to do this.
- if (IsPixelShader) {
+ if (false) {
// Record I's original position as the entry block.
PixelEntryBB = I.getParent();
@@ -705,29 +705,35 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// This is the value in the atomic operation we need to combine in order to
// reduce the number of atomic operations.
Value *V = I.getOperand(ValIdx);
+ // ------------------------------------
+ Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
+ CallInst *const WaveRed =
+ B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)});
+
+ // ------------------------------------
// We need to know how many lanes are active within the wavefront, and we do
// this by doing a ballot of active lanes.
- Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
- CallInst *const Ballot =
- B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
+ // Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
+ // CallInst *const Ballot =
+ // B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
// We need to know how many lanes are active within the wavefront that are
// below us. If we counted each lane linearly starting from 0, a lane is
// below us only if its associated index was less than ours. We do this by
// using the mbcnt intrinsic.
- Value *Mbcnt;
- if (ST->isWave32()) {
- Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
- {Ballot, B.getInt32(0)});
- } else {
- Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
- Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
- Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
- {ExtractLo, B.getInt32(0)});
- Mbcnt =
- B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
- }
+ // Value *Mbcnt;
+ // if (ST->isWave32()) {
+ // Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
+ // {Ballot, B.getInt32(0)});
+ // } else {
+ // Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
+ // Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
+ // Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
+ // {ExtractLo, B.getInt32(0)});
+ // Mbcnt =
+ // B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
+ // }
Function *F = I.getFunction();
LLVMContext &C = F->getContext();
@@ -745,13 +751,14 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
Value *ExclScan = nullptr;
Value *NewV = nullptr;
- const bool NeedResult = !I.use_empty();
+ // const bool NeedResult = !I.use_empty();
+ const bool NeedResult = false;
BasicBlock *ComputeLoop = nullptr;
BasicBlock *ComputeEnd = nullptr;
// If we have a divergent value in each lane, we need to combine the value
// using DPP.
- if (ValDivergent) {
+ if (false) {
if (ScanImpl == ScanOptions::DPP) {
// First we need to set all inactive invocations to the identity value, so
// that they can correctly contribute to the final result.
@@ -785,54 +792,55 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
llvm_unreachable("Atomic Optimzer is disabled for None strategy");
}
} else {
- switch (Op) {
- default:
- llvm_unreachable("Unhandled atomic op");
-
- case AtomicRMWInst::Add:
- case AtomicRMWInst::Sub: {
- // The new value we will be contributing to the atomic operation is the
- // old value times the number of active lanes.
- Value *const Ctpop = B.CreateIntCast(
- B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
- NewV = buildMul(B, V, Ctpop);
- break;
- }
- case AtomicRMWInst::FAdd:
- case AtomicRMWInst::FSub: {
- Value *const Ctpop = B.CreateIntCast(
- B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
- Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
- NewV = B.CreateFMul(V, CtpopFP);
- break;
- }
- case AtomicRMWInst::And:
- case AtomicRMWInst::Or:
- case AtomicRMWInst::Max:
- case AtomicRMWInst::Min:
- case AtomicRMWInst::UMax:
- case AtomicRMWInst::UMin:
- case AtomicRMWInst::FMin:
- case AtomicRMWInst::FMax:
- // These operations with a uniform value are idempotent: doing the atomic
- // operation multiple times has the same effect as doing it once.
- NewV = V;
- break;
-
- case AtomicRMWInst::Xor:
- // The new value we will be contributing to the atomic operation is the
- // old value times the parity of the number of active lanes.
- Value *const Ctpop = B.CreateIntCast(
- B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
- NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
- break;
- }
+ // switch (Op) {
+ // default:
+ // llvm_unreachable("Unhandled atomic op");
+
+ // case AtomicRMWInst::Add:
+ // case AtomicRMWInst::Sub: {
+ // // The new value we will be contributing to the atomic operation is the
+ // // old value times the number of active lanes.
+ // // Value *const Ctpop = B.CreateIntCast(
+ // // B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+ // // NewV = buildMul(B, V, Ctpop);
+ // break;
+ // }
+ // case AtomicRMWInst::FAdd:
+ // case AtomicRMWInst::FSub: {
+ // // Value *const Ctpop = B.CreateIntCast(
+ // // B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
+ // // Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
+ // // NewV = B.CreateFMul(V, CtpopFP);
+ // break;
+ // }
+ // case AtomicRMWInst::And:
+ // case AtomicRMWInst::Or:
+ // case AtomicRMWInst::Max:
+ // case AtomicRMWInst::Min:
+ // case AtomicRMWInst::UMax:
+ // case AtomicRMWInst::UMin:
+ // case AtomicRMWInst::FMin:
+ // case AtomicRMWInst::FMax:
+ // // These operations with a uniform value are idempotent: doing the atomic
+ // // operation multiple times has the same effect as doing it once.
+ // NewV = V;
+ // break;
+
+ // case AtomicRMWInst::Xor:
+ // // The new value we will be contributing to the atomic operation is the
+ // // old value times the parity of the number of active lanes.
+ // Value *const Ctpop = B.CreateIntCast(
+ // B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+ // NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
+ // break;
+ // }
+ // }
}
// We only want a single lane to enter our new control flow, and we do this
// by checking if there are any active lanes below us. Only one lane will
// have 0 active lanes below us, so that will be the only one to progress.
- Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));
+ // Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));
// Store I's original basic block before we split the block.
BasicBlock *const OriginalBB = I.getParent();
@@ -842,8 +850,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// new block such that:
// entry --> single_lane -\
// \------------------> exit
- Instruction *const SingleLaneTerminator =
- SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);
+ // Instruction *const SingleLaneTerminator =
+ // SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);
// At this point, we have split the I's block to allow one lane in wavefront
// to update the precomputed reduced value. Also, completed the codegen for
@@ -854,135 +862,135 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// ComputeEnd block. We also need to set up predecessor to next block when
// single lane done updating the final reduced value.
BasicBlock *Predecessor = nullptr;
- if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
+ // if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
// Move terminator from I's block to ComputeEnd block.
//
// OriginalBB is known to have a branch as terminator because
// SplitBlockAndInsertIfThen will have inserted one.
- BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
- B.SetInsertPoint(ComputeEnd);
- Terminator->removeFromParent();
- B.Insert(Terminator);
+ // BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
+ // B.SetInsertPoint(ComputeEnd);
+ // Terminator->removeFromParent();
+ // B.Insert(Terminator);
// Branch to ComputeLoop Block unconditionally from the I's block for
// iterative approach.
- B.SetInsertPoint(OriginalBB);
- B.CreateBr(ComputeLoop);
+ // B.SetInsertPoint(OriginalBB);
+ // B.CreateBr(ComputeLoop);
// Update the dominator tree for new control flow.
- SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
- {{DominatorTree::Insert, OriginalBB, ComputeLoop},
- {DominatorTree::Insert, ComputeLoop, ComputeEnd}});
+ // SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
+ // {{DominatorTree::Insert, OriginalBB, ComputeLoop},
+ // {DominatorTree::Insert, ComputeLoop, ComputeEnd}});
// We're moving the terminator from EntryBB to ComputeEnd, make sure we move
// the DT edges as well.
- for (auto *Succ : Terminator->successors()) {
- DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
- DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
- }
+ // for (auto *Succ : Terminator->successors()) {
+ // DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
+ // DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
+ // }
- DTU.applyUpdates(DomTreeUpdates);
+ // DTU.applyUpdates(DomTreeUpdates);
- Predecessor = ComputeEnd;
- } else {
- Predecessor = OriginalBB;
- }
+ // Predecessor = ComputeEnd;
+ // } else {
+ // Predecessor = OriginalBB;
+ // }
// Move the IR builder into single_lane next.
- B.SetInsertPoint(SingleLaneTerminator);
+ // B.SetInsertPoint(SingleLaneTerminator);
// Clone the original atomic operation into single lane, replacing the
// original value with our newly created one.
Instruction *const NewI = I.clone();
B.Insert(NewI);
- NewI->setOperand(ValIdx, NewV);
+ NewI->setOperand(ValIdx, WaveRed);
// Move the IR builder into exit next, and start inserting just before the
// original instruction.
- B.SetInsertPoint(&I);
-
- if (NeedResult) {
- // Create a PHI node to get our new atomic result into the exit block.
- PHINode *const PHI = B.CreatePHI(Ty, 2);
- PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
- PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
-
- // We need to broadcast the value who was the lowest active lane (the first
- // lane) to all other lanes in the wavefront.
- Value *BroadcastI = nullptr;
- BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
-
- // Now that we have the result of our single atomic operation, we need to
- // get our individual lane's slice into the result. We use the lane offset
- // we previously calculated combined with the atomic result value we got
- // from the first lane, to get our lane's index into the atomic result.
- Value *LaneOffset = nullptr;
- if (ValDivergent) {
- if (ScanImpl == ScanOptions::DPP) {
- LaneOffset =
- B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
- } else if (ScanImpl == ScanOptions::Iterative) {
- LaneOffset = ExclScan;
- } else {
- llvm_unreachable("Atomic Optimzer is disabled for None strategy");
- }
- } else {
- Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
- : B.CreateIntCast(Mbcnt, Ty, false);
- switch (Op) {
- default:
- llvm_unreachable("Unhandled atomic op");
- case AtomicRMWInst::Add:
- case AtomicRMWInst::Sub:
- LaneOffset = buildMul(B, V, Mbcnt);
- break;
- case AtomicRMWInst::And:
- case AtomicRMWInst::Or:
- case AtomicRMWInst::Max:
- case AtomicRMWInst::Min:
- case AtomicRMWInst::UMax:
- case AtomicRMWInst::UMin:
- case AtomicRMWInst::FMin:
- case AtomicRMWInst::FMax:
- LaneOffset = B.CreateSelect(Cond, Identity, V);
- break;
- case AtomicRMWInst::Xor:
- LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
- break;
- case AtomicRMWInst::FAdd:
- case AtomicRMWInst::FSub: {
- LaneOffset = B.CreateFMul(V, Mbcnt);
- break;
- }
- }
- }
- Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
- if (isAtomicFloatingPointTy) {
- // For fadd/fsub the first active lane of LaneOffset should be the
- // identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
- // is V * +0.0 which might have the wrong sign or might be nan (if V is
- // inf or nan).
- //
- // For all floating point ops if the in-memory value was a nan then the
- // binop we just built might have quieted it or changed its payload.
- //
- // Correct all these problems by using BroadcastI as the result in the
- // first active lane.
- Result = B.CreateSelect(Cond, BroadcastI, Result);
- }
-
- if (IsPixelShader) {
- // Need a final PHI to reconverge to above the helper lane branch mask.
- B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());
-
- PHINode *const PHI = B.CreatePHI(Ty, 2);
- PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
- PHI->addIncoming(Result, I.getParent());
- I.replaceAllUsesWith(PHI);
- } else {
- // Replace the original atomic instruction with the new one.
- I.replaceAllUsesWith(Result);
- }
- }
+ // B.SetInsertPoint(&I);
+
+ // if (NeedResult) {
+ // // Create a PHI node to get our new atomic result into the exit block.
+ // PHINode *const PHI = B.CreatePHI(Ty, 2);
+ // PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
+ // PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
+
+ // // We need to broadcast the value who was the lowest active lane (the first
+ // // lane) to all other lanes in the wavefront.
+ // Value *BroadcastI = nullptr;
+ // BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
+
+ // // Now that we have the result of our single atomic operation, we need to
+ // // get our individual lane's slice into the result. We use the lane offset
+ // // we previously calculated combined with the atomic result value we got
+ // // from the first lane, to get our lane's index into the atomic result.
+ // Value *LaneOffset = nullptr;
+ // if (ValDivergent) {
+ // if (ScanImpl == ScanOptions::DPP) {
+ // LaneOffset =
+ // B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+ // } else if (ScanImpl == ScanOptions::Iterative) {
+ // LaneOffset = ExclScan;
+ // } else {
+ // llvm_unreachable("Atomic Optimzer is disabled for None strategy");
+ // }
+ // } else {
+ // Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
+ // : B.CreateIntCast(Mbcnt, Ty, false);
+ // switch (Op) {
+ // default:
+ // llvm_unreachable("Unhandled atomic op");
+ // case AtomicRMWInst::Add:
+ // case AtomicRMWInst::Sub:
+ // LaneOffset = buildMul(B, V, Mbcnt);
+ // break;
+ // case AtomicRMWInst::And:
+ // case AtomicRMWInst::Or:
+ // case AtomicRMWInst::Max:
+ // case AtomicRMWInst::Min:
+ // case AtomicRMWInst::UMax:
+ // case AtomicRMWInst::UMin:
+ // case AtomicRMWInst::FMin:
+ // case AtomicRMWInst::FMax:
+ // LaneOffset = B.CreateSelect(Cond, Identity, V);
+ // break;
+ // case AtomicRMWInst::Xor:
+ // LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
+ // break;
+ // case AtomicRMWInst::FAdd:
+ // case AtomicRMWInst::FSub: {
+ // LaneOffset = B.CreateFMul(V, Mbcnt);
+ // break;
+ // }
+ // }
+ // }
+ // Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
+ // if (isAtomicFloatingPointTy) {
+ // // For fadd/fsub the first active lane of LaneOffset should be the
+ // // identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
+ // // is V * +0.0 which might have the wrong sign or might be nan (if V is
+ // // inf or nan).
+ // //
+ // // For all floating point ops if the in-memory value was a nan then the
+ // // binop we just built might have quieted it or changed its payload.
+ // //
+ // // Correct all these problems by using BroadcastI as the result in the
+ // // first active lane.
+ // Result = B.CreateSelect(Cond, BroadcastI, Result);
+ // }
+
+ // if (IsPixelShader) {
+ // // Need a final PHI to reconverge to above the helper lane branch mask.
+ // B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());
+
+ // PHINode *const PHI = B.CreatePHI(Ty, 2);
+ // PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
+ // PHI->addIncoming(Result, I.getParent());
+ // I.replaceAllUsesWith(PHI);
+ // } else {
+ // // Replace the original atomic instruction with the new one.
+ // I.replaceAllUsesWith(Result);
+ // }
+ // }
// And delete the original.
I.eraseFromParent();
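
The lane-offset reconstruction that this hunk comments out rests on a simple identity: for an atomic add of a uniform value V, the old value a lane would have observed from its own atomic equals the single atomic's old value plus V times the number of active lanes below it (Mbcnt). A minimal host-side sketch of that identity, purely illustrative; the names Active, PerLaneOld and CombinedOld are invented for the example and make no claim about the actual GPU code path.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
  const uint32_t V = 7;                    // uniform value every active lane adds
  const uint32_t Mem = 100;                // initial value of the memory location
  const std::vector<bool> Active = {true, false, true, true, false, true};

  // Reference: every active lane performs its own atomic add, lowest lane first,
  // and records the old value it observed.
  std::vector<uint32_t> PerLaneOld(Active.size(), 0);
  uint32_t Serial = Mem;
  for (std::size_t Lane = 0; Lane < Active.size(); ++Lane) {
    if (!Active[Lane])
      continue;
    PerLaneOld[Lane] = Serial;
    Serial += V;
  }

  // Optimized form: one atomic for the whole wave returns Mem, and each lane
  // reconstructs its old value as BroadcastI + LaneOffset = Mem + V * Mbcnt.
  const uint32_t CombinedOld = Mem;
  uint32_t Mbcnt = 0;                      // active lanes below the current lane
  for (std::size_t Lane = 0; Lane < Active.size(); ++Lane) {
    if (!Active[Lane])
      continue;
    assert(CombinedOld + V * Mbcnt == PerLaneOld[Lane]);
    ++Mbcnt;
  }
  return 0;
}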
>From dd799eedb86c3e6f76bc27a0f37b36375f5a9e62 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Mon, 30 Sep 2024 10:33:45 +0530
Subject: [PATCH 06/13] Working test module, but atomicAdd still uses the
 umin intrinsic
---
.../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 345 +++++++++---------
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 +-
2 files changed, 172 insertions(+), 177 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 8bc3bc81002adf..e4782cb5d9c131 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -682,7 +682,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// lanes that are around only for the purposes of derivatives to take part
// in any cross-lane communication, and we use a branch on whether the lane is
// live to do this.
- if (false) {
+ if (IsPixelShader) {
// Record I's original position as the entry block.
PixelEntryBB = I.getParent();
@@ -705,35 +705,29 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// This is the value in the atomic operation we need to combine in order to
// reduce the number of atomic operations.
Value *V = I.getOperand(ValIdx);
- // ------------------------------------
- Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
- CallInst *const WaveRed =
- B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)});
-
- // ------------------------------------
// We need to know how many lanes are active within the wavefront, and we do
// this by doing a ballot of active lanes.
- // Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
- // CallInst *const Ballot =
- // B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
+ Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
+ CallInst *const Ballot =
+ B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
// We need to know how many lanes are active within the wavefront that are
// below us. If we counted each lane linearly starting from 0, a lane is
// below us only if its associated index was less than ours. We do this by
// using the mbcnt intrinsic.
- // Value *Mbcnt;
- // if (ST->isWave32()) {
- // Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
- // {Ballot, B.getInt32(0)});
- // } else {
- // Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
- // Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
- // Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
- // {ExtractLo, B.getInt32(0)});
- // Mbcnt =
- // B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
- // }
+ Value *Mbcnt;
+ if (ST->isWave32()) {
+ Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
+ {Ballot, B.getInt32(0)});
+ } else {
+ Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
+ Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
+ Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
+ {ExtractLo, B.getInt32(0)});
+ Mbcnt =
+ B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
+ }
Function *F = I.getFunction();
LLVMContext &C = F->getContext();
@@ -751,47 +745,46 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
Value *ExclScan = nullptr;
Value *NewV = nullptr;
- // const bool NeedResult = !I.use_empty();
- const bool NeedResult = false;
+ const bool NeedResult = !I.use_empty();
BasicBlock *ComputeLoop = nullptr;
BasicBlock *ComputeEnd = nullptr;
// If we have a divergent value in each lane, we need to combine the value
// using DPP.
- if (false) {
- if (ScanImpl == ScanOptions::DPP) {
- // First we need to set all inactive invocations to the identity value, so
- // that they can correctly contribute to the final result.
- NewV =
- B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
- if (!NeedResult && ST->hasPermLaneX16()) {
- // On GFX10 the permlanex16 instruction helps us build a reduction
- // without too many readlanes and writelanes, which are generally bad
- // for performance.
- NewV = buildReduction(B, ScanOp, NewV, Identity);
- } else {
- NewV = buildScan(B, ScanOp, NewV, Identity);
- if (NeedResult)
- ExclScan = buildShiftRight(B, NewV, Identity);
- // Read the value from the last lane, which has accumulated the values
- // of each active lane in the wavefront. This will be our new value
- // which we will provide to the atomic operation.
- Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
- NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
- {NewV, LastLaneIdx});
- }
- // Finally mark the readlanes in the WWM section.
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
- } else if (ScanImpl == ScanOptions::Iterative) {
- // Alternative implementation for scan
- ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
- ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
- std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
- ComputeLoop, ComputeEnd);
- } else {
- llvm_unreachable("Atomic Optimzer is disabled for None strategy");
- }
- } else {
+ // if (ValDivergent) {
+ // if (ScanImpl == ScanOptions::DPP) {
+ // // First we need to set all inactive invocations to the identity value, so
+ // // that they can correctly contribute to the final result.
+ // NewV =
+ // B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
+ // if (!NeedResult && ST->hasPermLaneX16()) {
+ // // On GFX10 the permlanex16 instruction helps us build a reduction
+ // // without too many readlanes and writelanes, which are generally bad
+ // // for performance.
+ // NewV = buildReduction(B, ScanOp, NewV, Identity);
+ // } else {
+ // NewV = buildScan(B, ScanOp, NewV, Identity);
+ // if (NeedResult)
+ // ExclScan = buildShiftRight(B, NewV, Identity);
+ // // Read the value from the last lane, which has accumulated the values
+ // // of each active lane in the wavefront. This will be our new value
+ // // which we will provide to the atomic operation.
+ // Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+ // NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
+ // {NewV, LastLaneIdx});
+ // }
+ // // Finally mark the readlanes in the WWM section.
+ // NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
+ // } else if (ScanImpl == ScanOptions::Iterative) {
+ // // Alternative implementation for scan
+ // ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
+ // ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
+ // std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
+ // ComputeLoop, ComputeEnd);
+ // } else {
+ // llvm_unreachable("Atomic Optimzer is disabled for None strategy");
+ // }
+ // } else {
// switch (Op) {
// default:
// llvm_unreachable("Unhandled atomic op");
@@ -800,17 +793,17 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// case AtomicRMWInst::Sub: {
// // The new value we will be contributing to the atomic operation is the
// // old value times the number of active lanes.
- // // Value *const Ctpop = B.CreateIntCast(
- // // B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
- // // NewV = buildMul(B, V, Ctpop);
+ // Value *const Ctpop = B.CreateIntCast(
+ // B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
+ // NewV = buildMul(B, V, Ctpop);
// break;
// }
// case AtomicRMWInst::FAdd:
// case AtomicRMWInst::FSub: {
- // // Value *const Ctpop = B.CreateIntCast(
- // // B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
- // // Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
- // // NewV = B.CreateFMul(V, CtpopFP);
+ // Value *const Ctpop = B.CreateIntCast(
+ // B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
+ // Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
+ // NewV = B.CreateFMul(V, CtpopFP);
// break;
// }
// case AtomicRMWInst::And:
@@ -835,12 +828,13 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// break;
// }
// }
- }
+
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)});
// We only want a single lane to enter our new control flow, and we do this
// by checking if there are any active lanes below us. Only one lane will
// have 0 active lanes below us, so that will be the only one to progress.
- // Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));
+ Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));
// Store I's original basic block before we split the block.
BasicBlock *const OriginalBB = I.getParent();
@@ -850,8 +844,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// new block such that:
// entry --> single_lane -\
// \------------------> exit
- // Instruction *const SingleLaneTerminator =
- // SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);
+ Instruction *const SingleLaneTerminator =
+ SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);
// At this point, we have split the I's block to allow one lane in wavefront
// to update the precomputed reduced value. Also, completed the codegen for
@@ -863,27 +857,27 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// single lane done updating the final reduced value.
BasicBlock *Predecessor = nullptr;
// if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
- // Move terminator from I's block to ComputeEnd block.
- //
- // OriginalBB is known to have a branch as terminator because
- // SplitBlockAndInsertIfThen will have inserted one.
- // BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
- // B.SetInsertPoint(ComputeEnd);
- // Terminator->removeFromParent();
- // B.Insert(Terminator);
-
- // Branch to ComputeLoop Block unconditionally from the I's block for
- // iterative approach.
- // B.SetInsertPoint(OriginalBB);
- // B.CreateBr(ComputeLoop);
-
- // Update the dominator tree for new control flow.
- // SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
- // {{DominatorTree::Insert, OriginalBB, ComputeLoop},
- // {DominatorTree::Insert, ComputeLoop, ComputeEnd}});
-
- // We're moving the terminator from EntryBB to ComputeEnd, make sure we move
- // the DT edges as well.
+ // // Move terminator from I's block to ComputeEnd block.
+ // //
+ // // OriginalBB is known to have a branch as terminator because
+ // // SplitBlockAndInsertIfThen will have inserted one.
+ // BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
+ // B.SetInsertPoint(ComputeEnd);
+ // Terminator->removeFromParent();
+ // B.Insert(Terminator);
+
+ // // Branch to ComputeLoop Block unconditionally from the I's block for
+ // // iterative approach.
+ // B.SetInsertPoint(OriginalBB);
+ // B.CreateBr(ComputeLoop);
+
+ // // Update the dominator tree for new control flow.
+ // SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
+ // {{DominatorTree::Insert, OriginalBB, ComputeLoop},
+ // {DominatorTree::Insert, ComputeLoop, ComputeEnd}});
+
+ // // We're moving the terminator from EntryBB to ComputeEnd, make sure we move
+ // // the DT edges as well.
// for (auto *Succ : Terminator->successors()) {
// DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
// DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
@@ -895,102 +889,103 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// } else {
// Predecessor = OriginalBB;
// }
+ Predecessor = OriginalBB;
// Move the IR builder into single_lane next.
- // B.SetInsertPoint(SingleLaneTerminator);
+ B.SetInsertPoint(SingleLaneTerminator);
// Clone the original atomic operation into single lane, replacing the
// original value with our newly created one.
Instruction *const NewI = I.clone();
B.Insert(NewI);
- NewI->setOperand(ValIdx, WaveRed);
+ NewI->setOperand(ValIdx, NewV);
// Move the IR builder into exit next, and start inserting just before the
// original instruction.
- // B.SetInsertPoint(&I);
-
- // if (NeedResult) {
- // // Create a PHI node to get our new atomic result into the exit block.
- // PHINode *const PHI = B.CreatePHI(Ty, 2);
- // PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
- // PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
-
- // // We need to broadcast the value who was the lowest active lane (the first
- // // lane) to all other lanes in the wavefront.
- // Value *BroadcastI = nullptr;
- // BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
-
- // // Now that we have the result of our single atomic operation, we need to
- // // get our individual lane's slice into the result. We use the lane offset
- // // we previously calculated combined with the atomic result value we got
- // // from the first lane, to get our lane's index into the atomic result.
- // Value *LaneOffset = nullptr;
- // if (ValDivergent) {
- // if (ScanImpl == ScanOptions::DPP) {
- // LaneOffset =
- // B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
- // } else if (ScanImpl == ScanOptions::Iterative) {
- // LaneOffset = ExclScan;
- // } else {
- // llvm_unreachable("Atomic Optimzer is disabled for None strategy");
- // }
- // } else {
- // Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
- // : B.CreateIntCast(Mbcnt, Ty, false);
- // switch (Op) {
- // default:
- // llvm_unreachable("Unhandled atomic op");
- // case AtomicRMWInst::Add:
- // case AtomicRMWInst::Sub:
- // LaneOffset = buildMul(B, V, Mbcnt);
- // break;
- // case AtomicRMWInst::And:
- // case AtomicRMWInst::Or:
- // case AtomicRMWInst::Max:
- // case AtomicRMWInst::Min:
- // case AtomicRMWInst::UMax:
- // case AtomicRMWInst::UMin:
- // case AtomicRMWInst::FMin:
- // case AtomicRMWInst::FMax:
- // LaneOffset = B.CreateSelect(Cond, Identity, V);
- // break;
- // case AtomicRMWInst::Xor:
- // LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
- // break;
- // case AtomicRMWInst::FAdd:
- // case AtomicRMWInst::FSub: {
- // LaneOffset = B.CreateFMul(V, Mbcnt);
- // break;
- // }
- // }
- // }
- // Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
- // if (isAtomicFloatingPointTy) {
- // // For fadd/fsub the first active lane of LaneOffset should be the
- // // identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
- // // is V * +0.0 which might have the wrong sign or might be nan (if V is
- // // inf or nan).
- // //
- // // For all floating point ops if the in-memory value was a nan then the
- // // binop we just built might have quieted it or changed its payload.
- // //
- // // Correct all these problems by using BroadcastI as the result in the
- // // first active lane.
- // Result = B.CreateSelect(Cond, BroadcastI, Result);
- // }
+ B.SetInsertPoint(&I);
- // if (IsPixelShader) {
- // // Need a final PHI to reconverge to above the helper lane branch mask.
- // B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());
+ if (NeedResult) {
+ // Create a PHI node to get our new atomic result into the exit block.
+ PHINode *const PHI = B.CreatePHI(Ty, 2);
+ PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
+ PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
+
+ // We need to broadcast the value who was the lowest active lane (the first
+ // lane) to all other lanes in the wavefront.
+ Value *BroadcastI = nullptr;
+ BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
+
+ // Now that we have the result of our single atomic operation, we need to
+ // get our individual lane's slice into the result. We use the lane offset
+ // we previously calculated combined with the atomic result value we got
+ // from the first lane, to get our lane's index into the atomic result.
+ Value *LaneOffset = nullptr;
+ if (ValDivergent) {
+ if (ScanImpl == ScanOptions::DPP) {
+ LaneOffset =
+ B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+ } else if (ScanImpl == ScanOptions::Iterative) {
+ LaneOffset = ExclScan;
+ } else {
+ llvm_unreachable("Atomic Optimzer is disabled for None strategy");
+ }
+ } else {
+ Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
+ : B.CreateIntCast(Mbcnt, Ty, false);
+ switch (Op) {
+ default:
+ llvm_unreachable("Unhandled atomic op");
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ LaneOffset = buildMul(B, V, Mbcnt);
+ break;
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
+ case AtomicRMWInst::FMin:
+ case AtomicRMWInst::FMax:
+ LaneOffset = B.CreateSelect(Cond, Identity, V);
+ break;
+ case AtomicRMWInst::Xor:
+ LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
+ break;
+ case AtomicRMWInst::FAdd:
+ case AtomicRMWInst::FSub: {
+ LaneOffset = B.CreateFMul(V, Mbcnt);
+ break;
+ }
+ }
+ }
+ Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
+ if (isAtomicFloatingPointTy) {
+ // For fadd/fsub the first active lane of LaneOffset should be the
+ // identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
+ // is V * +0.0 which might have the wrong sign or might be nan (if V is
+ // inf or nan).
+ //
+ // For all floating point ops if the in-memory value was a nan then the
+ // binop we just built might have quieted it or changed its payload.
+ //
+ // Correct all these problems by using BroadcastI as the result in the
+ // first active lane.
+ Result = B.CreateSelect(Cond, BroadcastI, Result);
+ }
- // PHINode *const PHI = B.CreatePHI(Ty, 2);
- // PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
- // PHI->addIncoming(Result, I.getParent());
- // I.replaceAllUsesWith(PHI);
- // } else {
- // // Replace the original atomic instruction with the new one.
- // I.replaceAllUsesWith(Result);
- // }
- // }
+ if (IsPixelShader) {
+ // Need a final PHI to reconverge to above the helper lane branch mask.
+ B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());
+
+ PHINode *const PHI = B.CreatePHI(Ty, 2);
+ PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
+ PHI->addIncoming(Result, I.getParent());
+ I.replaceAllUsesWith(PHI);
+ } else {
+ // Replace the original atomic instruction with the new one.
+ I.replaceAllUsesWith(Result);
+ }
+ }
// And delete the original.
I.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cd057702d6072d..7dfab57ebc6b0d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4873,8 +4873,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
// These operations with a uniform value i.e. SGPR are idempotent.
// Reduced value will be same as given sgpr.
// bool IsWave32 = ST.isWave32();
- unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- BuildMI(BB, MI, DL, TII->get(MovOpc), DstReg).addReg(SrcReg);
+ // unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
RetBB = &BB;
break;
}
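
The ballot/mbcnt sequence restored in this patch is a leader election: mbcnt counts the active lanes strictly below the current lane, so exactly one active lane sees a count of zero and takes the single_lane branch. A host-side sketch of that property, assuming a 64-wide wave; Ballot, Mbcnt and Leaders are illustrative names, not the compiler's, and __builtin_popcountll stands in for the mbcnt_lo/mbcnt_hi pair.

#include <cassert>
#include <cstdint>

// Mbcnt(lane) = number of active lanes strictly below 'lane'.
static uint32_t Mbcnt(uint64_t Ballot, uint32_t Lane) {
  return static_cast<uint32_t>(
      __builtin_popcountll(Ballot & ((uint64_t{1} << Lane) - 1)));
}

int main() {
  const uint64_t Ballot = 0b101100;        // lanes 2, 3 and 5 are active
  int Leaders = 0;
  for (uint32_t Lane = 0; Lane < 64; ++Lane)
    if (((Ballot >> Lane) & 1) != 0 && Mbcnt(Ballot, Lane) == 0)
      ++Leaders;
  assert(Leaders == 1);                    // exactly one lane enters single_lane
  return 0;
}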
>From 91a569741796eca70f33ba3d132954f0fe564f16 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 1 Oct 2024 12:33:46 +0530
Subject: [PATCH 07/13] changes to IntrinsicsAMDGPU.td
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 24 ++++++++++++++++++------
1 file changed, 18 insertions(+), 6 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index fc6d13899a809d..b2b6e2039f1725 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2119,6 +2119,18 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
+//multiclass AMDGPUWaveReducee {
+// foreach Opcode = ["umin", "umax", "add", "uadd", "sub", "usub", "and", "or", "xor"] in
+// def int_amdgcn_wave_reduce_#Opcode : AMDGPUWaveReduce;
+//}
+
+//multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
+// foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
+// def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
+//}
+
+//WaveReduceDefs<["umin", "umax", "add", "uadd", "sub", "usub", "and", "or", "xor"]>;
+//list<string> Operations
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_min : AMDGPUWaveReduce;
@@ -2127,17 +2139,17 @@ def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_max : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_fmax : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_add : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_uadd : AMDGPUWaveReduce;
+//def int_amdgcn_wave_reduce_uadd : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_fadd : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_sub : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_usub : AMDGPUWaveReduce;
+//def int_amdgcn_wave_reduce_usub : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_fsub : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_and : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_uand : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_fand : AMDGPUWaveReduce;
+//def int_amdgcn_wave_reduce_uand : AMDGPUWaveReduce;
+//def int_amdgcn_wave_reduce_fand : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_or : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_uor : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_for : AMDGPUWaveReduce;
+//def int_amdgcn_wave_reduce_uor : AMDGPUWaveReduce;
+//def int_amdgcn_wave_reduce_for : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_xor : AMDGPUWaveReduce;
def int_amdgcn_readfirstlane :
>From d447aa509957f68db0b893d863afd2fe5829e3e5 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 1 Oct 2024 12:37:56 +0530
Subject: [PATCH 08/13] changes to AMDGPUAtomicOptimizer.cpp
---
.../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 96 ++++++++++---------
1 file changed, 52 insertions(+), 44 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index e4782cb5d9c131..76c1feb0d5fe08 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -785,52 +785,60 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// llvm_unreachable("Atomic Optimzer is disabled for None strategy");
// }
// } else {
- // switch (Op) {
- // default:
- // llvm_unreachable("Unhandled atomic op");
-
- // case AtomicRMWInst::Add:
- // case AtomicRMWInst::Sub: {
- // // The new value we will be contributing to the atomic operation is the
- // // old value times the number of active lanes.
- // Value *const Ctpop = B.CreateIntCast(
- // B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
- // NewV = buildMul(B, V, Ctpop);
- // break;
- // }
- // case AtomicRMWInst::FAdd:
- // case AtomicRMWInst::FSub: {
- // Value *const Ctpop = B.CreateIntCast(
- // B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
- // Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
- // NewV = B.CreateFMul(V, CtpopFP);
- // break;
- // }
- // case AtomicRMWInst::And:
- // case AtomicRMWInst::Or:
- // case AtomicRMWInst::Max:
- // case AtomicRMWInst::Min:
- // case AtomicRMWInst::UMax:
- // case AtomicRMWInst::UMin:
- // case AtomicRMWInst::FMin:
- // case AtomicRMWInst::FMax:
- // // These operations with a uniform value are idempotent: doing the atomic
- // // operation multiple times has the same effect as doing it once.
- // NewV = V;
- // break;
-
- // case AtomicRMWInst::Xor:
- // // The new value we will be contributing to the atomic operation is the
- // // old value times the parity of the number of active lanes.
- // Value *const Ctpop = B.CreateIntCast(
- // B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
- // NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
- // break;
- // }
- // }
+ // **************************************** Implement from here
+ switch (Op) {
+ // TODO --implement for floats
+ default:
+ llvm_unreachable("Unhandled atomic op");
+
+ case AtomicRMWInst::Add:
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_add, Int32Ty, {V, B.getInt32(0)});
+ break;
+ case AtomicRMWInst::Sub:
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_sub, Int32Ty, {V, B.getInt32(0)});
+ break;
+ case AtomicRMWInst::FAdd:
+ case AtomicRMWInst::FSub: {
+ Value *const Ctpop = B.CreateIntCast(
+ B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
+ Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
+ NewV = B.CreateFMul(V, CtpopFP);
+ break;
+ }
+ case AtomicRMWInst::And:
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_and, Int32Ty, {V, B.getInt32(0)});
+ break;
+ case AtomicRMWInst::Or:
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_or, Int32Ty, {V, B.getInt32(0)});
+ break;
+ case AtomicRMWInst::Xor:
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_xor, Int32Ty, {V, B.getInt32(0)});
+ break;
+ case AtomicRMWInst::Max:
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_max, Int32Ty, {V, B.getInt32(0)});
+ break;
+ case AtomicRMWInst::Min:
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_min, Int32Ty, {V, B.getInt32(0)});
+ break;
+ case AtomicRMWInst::UMax:
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, Int32Ty, {V, B.getInt32(0)});
+ break;
+ case AtomicRMWInst::UMin:
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)});
+ break;
+ case AtomicRMWInst::FMin:
+ case AtomicRMWInst::FMax:
+ // These operations with a uniform value are idempotent: doing the atomic
+ // operation multiple times has the same effect as doing it once.
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)});
+ break;
+
+ }
+
+ // **************************************** Implement to here
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)});
+ // NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wave_reduce_umin, Int32Ty, {V, B.getInt32(0)});
// We only want a single lane to enter our new control flow, and we do this
// by checking if there are any active lanes below us. Only one lane will
// have 0 active lanes below us, so that will be the only one to progress.
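
The FAdd/FSub arm kept in the switch above still derives the wave's contribution from the ballot: for a uniform float value V, the combined contribution is V multiplied by the active-lane count (Ctpop), which is what the CreateFMul computes. A small illustrative host-side check follows; Ballot and Serial are made-up names, and the values are chosen so the float comparison is exact.

#include <cassert>
#include <cstdint>

int main() {
  const float V = 1.5f;                    // uniform per-lane float contribution
  const uint64_t Ballot = 0b11011;         // 4 active lanes
  const uint32_t Ctpop =
      static_cast<uint32_t>(__builtin_popcountll(Ballot));

  float Serial = 0.0f;                     // what per-lane fadds would accumulate
  for (uint32_t Lane = 0; Lane < 64; ++Lane)
    if ((Ballot >> Lane) & 1)
      Serial += V;

  // Mirrors NewV = B.CreateFMul(V, CtpopFP); exact for these small values.
  assert(Serial == V * static_cast<float>(Ctpop));
  return 0;
}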
>From a33398dba7113486feedf3b163d0381a52cd7052 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 1 Oct 2024 12:38:37 +0530
Subject: [PATCH 09/13] changes to AMDGPURegisterBankInfo.cpp
---
.../lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 16 ++++++++++------
1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 68b0857fd21504..24c6dc0afbce57 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4846,15 +4846,19 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
break;
}
- case Intrinsic::amdgcn_wave_reduce_umin:
+ case Intrinsic::amdgcn_wave_reduce_add:
+ case Intrinsic::amdgcn_wave_reduce_fadd:
+ case Intrinsic::amdgcn_wave_reduce_sub:
+ case Intrinsic::amdgcn_wave_reduce_fsub:
+ case Intrinsic::amdgcn_wave_reduce_min:
+ case Intrinsic::amdgcn_wave_reduce_umin:
+ case Intrinsic::amdgcn_wave_reduce_fmin:
+ case Intrinsic::amdgcn_wave_reduce_max:
case Intrinsic::amdgcn_wave_reduce_umax:
+ case Intrinsic::amdgcn_wave_reduce_fmax:
case Intrinsic::amdgcn_wave_reduce_and:
case Intrinsic::amdgcn_wave_reduce_or:
- case Intrinsic::amdgcn_wave_reduce_xor:
- case Intrinsic::amdgcn_wave_reduce_usub:
- case Intrinsic::amdgcn_wave_reduce_sub:
- case Intrinsic::amdgcn_wave_reduce_uadd:
- case Intrinsic::amdgcn_wave_reduce_add: {
+ case Intrinsic::amdgcn_wave_reduce_xor: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
>From 9b6f7099d2c1b6a2f03ddd4724f2065c8dd7e6d8 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 1 Oct 2024 12:39:49 +0530
Subject: [PATCH 10/13] changes to SIISelLowering.cpp
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 67 +++++++++++++++--------
1 file changed, 44 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7dfab57ebc6b0d..41431ee9ee2824 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5047,8 +5047,29 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
// Create initail values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlock
- uint32_t InitalValue =
- (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+ uint32_t InitalValue;
+ switch(Opc){
+ case AMDGPU::S_MIN_U32:
+ InitalValue = std::numeric_limits<uint32_t>::max();
+ break;
+ case AMDGPU::S_MIN_I32:
+ InitalValue = std::numeric_limits<int32_t>::max();
+ break;
+ case AMDGPU::S_MAX_U32:
+ InitalValue = 0;
+ break;
+ case AMDGPU::S_MAX_I32:
+ InitalValue = std::numeric_limits<int32_t>::min();
+ break;
+ case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_SUB_I32:
+ case AMDGPU::S_OR_B32:
+ case AMDGPU::S_XOR_B32:
+ InitalValue = 0x00000000;
+ break;
+ case AMDGPU::S_AND_B32:
+ InitalValue = 0xFFFFFFFF;
+ }
auto TmpSReg =
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -5114,43 +5135,43 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
switch (MI.getOpcode()) {
- case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
+ case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
- case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_I32:
+ case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
- case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_F32:
+ case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_F32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_F32);
- case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
+ case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
- case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_I32:
+ case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
- case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_F32:
+ case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_F32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_F32);
- case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U32:
- return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U32);
+ // case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U32:
+ // return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_F32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_F32);
- case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U32:
- return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U32);
+ // case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U32:
+ // return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_F32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_F32);
- case AMDGPU::WAVE_REDUCE_AND_PSEUDO_U32:
+ case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
- case AMDGPU::WAVE_REDUCE_AND_PSEUDO_I32:
- return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
- case AMDGPU::WAVE_REDUCE_AND_PSEUDO_F32:
- return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
- case AMDGPU::WAVE_REDUCE_OR_PSEUDO_U32:
- return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
- case AMDGPU::WAVE_REDUCE_OR_PSEUDO_I32:
- return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
- case AMDGPU::WAVE_REDUCE_OR_PSEUDO_F32:
+ // case AMDGPU::WAVE_REDUCE_AND_PSEUDO_I32:
+ // return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+ // case AMDGPU::WAVE_REDUCE_AND_PSEUDO_F32:
+ // return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+ case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
- case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_U32:
+ // case AMDGPU::WAVE_REDUCE_OR_PSEUDO_I32:
+ // return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+ // case AMDGPU::WAVE_REDUCE_OR_PSEUDO_F32:
+ // return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+ case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
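
The InitalValue switch added in this patch seeds the reduction loop with the identity element of each operation, i.e. a value x0 with op(x0, x) == x for every x: UINT32_MAX for unsigned min, INT32_MAX for signed min, 0 for unsigned max, INT32_MIN for signed max, 0 for add, sub, or and xor, and all-ones for and. A short host-side check of those identities, for illustration only.

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t U = 0xDEADBEEF;           // arbitrary unsigned lane value
  const int32_t S = -42;                   // arbitrary signed lane value

  assert(std::min<uint32_t>(UINT32_MAX, U) == U);   // S_MIN_U32 identity
  assert(std::min<int32_t>(INT32_MAX, S) == S);     // S_MIN_I32 identity
  assert(std::max<uint32_t>(0, U) == U);            // S_MAX_U32 identity
  assert(std::max<int32_t>(INT32_MIN, S) == S);     // S_MAX_I32 identity
  assert((0u + U) == U && (0u ^ U) == U && (0u | U) == U);  // add/xor/or identity
  assert((0xFFFFFFFFu & U) == U);                   // S_AND_B32 identity
  return 0;
}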
>From 45a02a1d01c6b1629dee41b8bf429d40300abe75 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 1 Oct 2024 12:40:16 +0530
Subject: [PATCH 11/13] changes to SIInstructions.td
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 70 ++++++++++++------------
1 file changed, 35 insertions(+), 35 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 534b4d2c052482..c5883ff7839033 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -255,32 +255,32 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
- def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ def WAVE_REDUCE_MIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
}
- def WAVE_REDUCE_UMIN_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ def WAVE_REDUCE_MIN_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_min i32:$src, i32:$strategy))]> {
}
- def WAVE_REDUCE_UMIN_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ def WAVE_REDUCE_MIN_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set f32:$sdst, (int_amdgcn_wave_reduce_fmin f32:$src, i32:$strategy))]> {
}
- def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ def WAVE_REDUCE_MAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
}
- def WAVE_REDUCE_UMAX_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ def WAVE_REDUCE_MAX_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_max i32:$src, i32:$strategy))]> {
}
- def WAVE_REDUCE_UMAX_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ def WAVE_REDUCE_MAX_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set f32:$sdst, (int_amdgcn_wave_reduce_fmax f32:$src, i32:$strategy))]> {
}
@@ -295,57 +295,57 @@ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses
[(set f32:$sdst, (int_amdgcn_wave_reduce_add f32:$src, i32:$strategy))]> {
}
- def WAVE_REDUCE_ADD_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
- (ins VSrc_b32: $src, VSrc_b32:$strategy),
- [(set i32:$sdst, (int_amdgcn_wave_reduce_uadd i32:$src, i32:$strategy))]> {
- }
+ //def WAVE_REDUCE_ADD_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ // (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ // [(set i32:$sdst, (int_amdgcn_wave_reduce_uadd i32:$src, i32:$strategy))]> {
+ //}
def WAVE_REDUCE_SUB_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_sub i32:$src, i32:$strategy))]> {
}
- def WAVE_REDUCE_SUB_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
- (ins VSrc_b32: $src, VSrc_b32:$strategy),
- [(set i32:$sdst, (int_amdgcn_wave_reduce_usub i32:$src, i32:$strategy))]> {
- }
+ //def WAVE_REDUCE_SUB_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ // (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ // [(set i32:$sdst, (int_amdgcn_wave_reduce_usub i32:$src, i32:$strategy))]> {
+ //}
def WAVE_REDUCE_SUB_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set f32:$sdst, (int_amdgcn_wave_reduce_fsub f32:$src, i32:$strategy))]> {
}
- def WAVE_REDUCE_AND_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
- (ins VSrc_b32: $src, VSrc_b32:$strategy),
- [(set i32:$sdst, (int_amdgcn_wave_reduce_uand i32:$src, i32:$strategy))]> {
- }
-
- def WAVE_REDUCE_AND_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ def WAVE_REDUCE_AND_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> {
}
- def WAVE_REDUCE_AND_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
- (ins VSrc_b32: $src, VSrc_b32:$strategy),
- [(set f32:$sdst, (int_amdgcn_wave_reduce_fand f32:$src, i32:$strategy))]> {
- }
-
- def WAVE_REDUCE_OR_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
- (ins VSrc_b32: $src, VSrc_b32:$strategy),
- [(set i32:$sdst, (int_amdgcn_wave_reduce_uor i32:$src, i32:$strategy))]> {
- }
+ //def WAVE_REDUCE_AND_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ // (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ // [(set i32:$sdst, (int_amdgcn_wave_reduce_and i32:$src, i32:$strategy))]> {
+ //}
- def WAVE_REDUCE_OR_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ //def WAVE_REDUCE_AND_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ // (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ // [(set f32:$sdst, (int_amdgcn_wave_reduce_fand f32:$src, i32:$strategy))]> {
+ //}
+
+ def WAVE_REDUCE_OR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> {
}
- def WAVE_REDUCE_OR_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
- (ins VSrc_b32: $src, VSrc_b32:$strategy),
- [(set f32:$sdst, (int_amdgcn_wave_reduce_for f32:$src, i32:$strategy))]> {
- }
+ //def WAVE_REDUCE_OR_PSEUDO_I32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ // (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ // [(set i32:$sdst, (int_amdgcn_wave_reduce_or i32:$src, i32:$strategy))]> {
+ //}
+
+ //def WAVE_REDUCE_OR_PSEUDO_F32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ // (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ // [(set f32:$sdst, (int_amdgcn_wave_reduce_for f32:$src, i32:$strategy))]> {
+ //}
- def WAVE_REDUCE_XOR_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ def WAVE_REDUCE_XOR_PSEUDO_B32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
[(set i32:$sdst, (int_amdgcn_wave_reduce_xor i32:$src, i32:$strategy))]> {
}
>From b5db313b9eb1ff7630887eae82d0568105648281 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 1 Oct 2024 18:29:20 +0530
Subject: [PATCH 12/13] Code cleanup in TableGen files.
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 39 ++++-------------------
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 9 ------
2 files changed, 7 insertions(+), 41 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index b2b6e2039f1725..e4d6e4b2f54597 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2119,38 +2119,13 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
-//multiclass AMDGPUWaveReducee {
-// foreach Opcode = ["umin", "umax", "add", "uadd", "sub", "usub", "and", "or", "xor"] in
-// def int_amdgcn_wave_reduce_#Opcode : AMDGPUWaveReduce;
-//}
-
-//multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
-// foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
-// def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
-//}
-
-//WaveReduceDefs<["umin", "umax", "add", "uadd", "sub", "usub", "and", "or", "xor"]>;
-//list<string> Operations
-
-def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_min : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_fmin : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_max : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_fmax : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_add : AMDGPUWaveReduce;
-//def int_amdgcn_wave_reduce_uadd : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_fadd : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_sub : AMDGPUWaveReduce;
-//def int_amdgcn_wave_reduce_usub : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_fsub : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_and : AMDGPUWaveReduce;
-//def int_amdgcn_wave_reduce_uand : AMDGPUWaveReduce;
-//def int_amdgcn_wave_reduce_fand : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_or : AMDGPUWaveReduce;
-//def int_amdgcn_wave_reduce_uor : AMDGPUWaveReduce;
-//def int_amdgcn_wave_reduce_for : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_xor : AMDGPUWaveReduce;
+multiclass AMDGPUWaveReduceGenerator<list<string> Operations>{
+ foreach Opcode = Operations in
+ def Opcode : AMDGPUWaveReduce;
+}
+
+defvar Operations = ["umin", "min", "fmin", "umax", "max", "fmax", "add", "fadd", "sub", "fsub", "and", "or", "xor"];
+defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceGenerator<Operations>;
def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 41431ee9ee2824..9816d0d354af80 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4872,8 +4872,6 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
case AMDGPU::S_OR_B32:{
// These operations with a uniform value i.e. SGPR are idempotent.
// Reduced value will be same as given sgpr.
- // bool IsWave32 = ST.isWave32();
- // unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
RetBB = &BB;
break;
@@ -4970,15 +4968,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
// of Active lanes then the XOR will result in the
// same value as that in the SGPR. This comes from
// the fact that A^A = 0 and A^0 = A.
-
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
-
auto ParityReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
.addReg(NewAccumulator->getOperand(0).getReg())
.addImm(1);
-// S_MUL_I32
- // auto MulOp =
- // Can you have one float and one int op? I dont think you can, need to handle the float case seperately.
BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(SrcReg)
.addReg(ParityReg->getOperand(0).getReg()) ;
@@ -4989,7 +4982,6 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
case AMDGPU::S_SUB_F32:{
// TODO --> use 2's compliment or subtract from 0 to find the negation of the number.
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
-
// Take the negation of the source operand.
auto InvertedValReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
@@ -4997,7 +4989,6 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
- // Doubt --> is SSA form still have to be followed for MIR?
case AMDGPU::S_ADD_U32:
case AMDGPU::S_ADD_I32:
case AMDGPU::S_ADD_F32:{
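
The S_XOR_B32 lowering above relies on the parity argument spelled out in the comment (A^A = 0 and A^0 = A): XOR-ing the same uniform value once per active lane leaves V when the lane count is odd and 0 when it is even, hence the S_AND_B32 with 1 followed by S_MUL_I32. An illustrative host-side check of that equivalence; Serial and Count are invented names.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t V = 0x1234;               // uniform value held by every lane
  for (uint32_t Count = 0; Count <= 8; ++Count) {
    uint32_t Serial = 0;
    for (uint32_t I = 0; I < Count; ++I)
      Serial ^= V;                         // Count per-lane XORs of the same value
    // What the emitted S_AND_B32 (parity) + S_MUL_I32 compute:
    assert(Serial == V * (Count & 1));
  }
  return 0;
}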
>From 7e9cb1c4f1f4afe02d7c8e2f4270318d89092877 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Tue, 8 Oct 2024 10:56:00 +0530
Subject: [PATCH 13/13] Bitcount changes to be copied
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 156 +++++++++++-----------
1 file changed, 75 insertions(+), 81 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9816d0d354af80..4ffcee15225cd8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4864,10 +4864,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
switch(Opc){
case AMDGPU::S_MIN_U32:
case AMDGPU::S_MIN_I32:
- case AMDGPU::S_MIN_F32:
case AMDGPU::S_MAX_U32:
case AMDGPU::S_MAX_I32:
- case AMDGPU::S_MAX_F32:
case AMDGPU::S_AND_B32:
case AMDGPU::S_OR_B32:{
// These operations with a uniform value i.e. SGPR are idempotent.
@@ -4877,88 +4875,86 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
break;
}
case AMDGPU::S_XOR_B32:
- case AMDGPU::S_ADD_U32:
case AMDGPU::S_ADD_I32:
- case AMDGPU::S_ADD_F32:
- case AMDGPU::S_SUB_U32:
- case AMDGPU::S_SUB_I32:
- case AMDGPU::S_SUB_F32:{
- MachineBasicBlock::iterator I = BB.end();
- Register SrcReg = MI.getOperand(1).getReg();
+ case AMDGPU::S_SUB_I32:{
+ // MachineBasicBlock::iterator I = BB.end();
+ // Register SrcReg = MI.getOperand(1).getReg();
- // Create Control flow for loop
- // Split MI's Machine Basic block into For loop
- auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
+ // // Create Control flow for loop
+ // // Split MI's Machine Basic block into For loop
+ // auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
- // Create virtual registers required for lowering.
+ // // Create virtual registers required for lowering.
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
- Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
- Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
+ Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
+ // Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
- Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
- Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
- Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ // Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
+ // Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ // Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
- Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
+ // Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
Register CountOfActiveLanesReg = MRI.createVirtualRegister(DstRegClass);
bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ unsigned CountReg = IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
// Create initail values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlock
- uint32_t InitalValue = 0;
+ // uint32_t InitalValue = 0;
- auto TmpSReg =
- BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
- BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
- .addImm(InitalValue);
- BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
-
- // Start constructing ComputeLoop
- I = ComputeLoop->end();
- auto Accumulator =
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
- .addReg(InitalValReg)
- .addMBB(&BB);
- auto ActiveBits =
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
- .addReg(TmpSReg->getOperand(0).getReg())
- .addMBB(&BB);
-
- // Perform the computations
- unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
- auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
- .addReg(ActiveBits->getOperand(0).getReg());
- auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), CountOfActiveLanesReg)
- .addReg(Accumulator->getOperand(0).getReg())
- .addImm(1);
-
- // Manipulate the iterator to get the next active lane
- unsigned BITSETOpc =
- IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
- auto NewActiveBits =
- BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
- .addReg(FF1->getOperand(0).getReg())
- .addReg(ActiveBits->getOperand(0).getReg());
-
- // Add phi nodes
- Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
- .addMBB(ComputeLoop);
- ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
- .addMBB(ComputeLoop);
-
- // Creating branching
- unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
- BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
- .addReg(NewActiveBits->getOperand(0).getReg())
- .addImm(0);
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
- .addMBB(ComputeLoop);
-
- I = ComputeEnd->begin();
+ auto Exec =
+ BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+
+ auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), CountOfActiveLanesReg)
+ .addReg(Exec->getOperand(0).getReg());
+
+ // BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
+ // .addImm(InitalValue);
+ // BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
+
+ // // Start constructing ComputeLoop
+ // I = ComputeLoop->end();
+ // auto Accumulator =
+ // BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
+ // .addReg(InitalValReg)
+ // .addMBB(&BB);
+ // auto ActiveBits =
+ // BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
+ // .addReg(TmpSReg->getOperand(0).getReg())
+ // .addMBB(&BB);
+
+ // // Perform the computations
+ // unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
+ // auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+ // .addReg(ActiveBits->getOperand(0).getReg());
+
+ // // Manipulate the iterator to get the next active lane
+ // unsigned BITSETOpc =
+ // IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
+ // auto NewActiveBits =
+ // BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+ // .addReg(FF1->getOperand(0).getReg())
+ // .addReg(ActiveBits->getOperand(0).getReg());
+
+ // // Add phi nodes
+ // Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
+ // .addMBB(ComputeLoop);
+ // ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
+ // .addMBB(ComputeLoop);
+
+ // // Creating branching
+ // unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
+ // BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
+ // .addReg(NewActiveBits->getOperand(0).getReg())
+ // .addImm(0);
+ // BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+ // .addMBB(ComputeLoop);
+
+ // I = ComputeEnd->begin();
switch(Opc){
case AMDGPU::S_XOR_B32:{
// Performing an XOR operation on a uniform value
@@ -4968,38 +4964,36 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
// of Active lanes then the XOR will result in the
// same value as that in the SGPR. This comes from
// the fact that A^A = 0 and A^0 = A.
+
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
- auto ParityReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+
+ auto ParityReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
.addReg(NewAccumulator->getOperand(0).getReg())
.addImm(1);
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(SrcReg)
.addReg(ParityReg->getOperand(0).getReg()) ;
break;
}
- case AMDGPU::S_SUB_U32:
- case AMDGPU::S_SUB_I32:
- case AMDGPU::S_SUB_F32:{
+ case AMDGPU::S_SUB_I32:{
// TODO --> use 2's compliment or subtract from 0 to find the negation of the number.
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+
// Take the negation of the source operand.
- auto InvertedValReg = BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
- BuildMI(*ComputeEnd, I, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+ auto InvertedValReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(InvertedValReg->getOperand(0).getReg())
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
- case AMDGPU::S_ADD_U32:
- case AMDGPU::S_ADD_I32:
- case AMDGPU::S_ADD_F32:{
- auto Opcode = Opc == AMDGPU::S_ADD_U32 || Opc == AMDGPU::S_ADD_I32 ? AMDGPU::S_MUL_I32 : AMDGPU::S_MUL_F32;
- BuildMI(*ComputeEnd, I, DL, TII->get(Opcode), DstReg)
+ case AMDGPU::S_ADD_I32:{
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
.addReg(SrcReg)
.addReg(NewAccumulator->getOperand(0).getReg());
break;
}
}
- RetBB = ComputeEnd;
+ RetBB = &BB;
}
}
} else {
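
For uniform inputs this last patch drops the iterative ComputeLoop entirely: S_BCNT1 gives the number of active lanes from EXEC, and a single multiply produces the reduced value (V * count for add, -V * count for sub, V * (count & 1) for xor). A host-side sketch of that equivalence; the names and the __builtin_popcountll stand-in for S_BCNT1 are illustrative only.

#include <cassert>
#include <cstdint>

int main() {
  const int32_t V = 9;                     // uniform per-lane value
  const uint64_t Exec = 0xF0F0;            // 8 active lanes
  const int32_t Count = __builtin_popcountll(Exec);   // models S_BCNT1_I32_B64

  int32_t Add = 0, Sub = 0, Xor = 0;
  for (uint32_t Lane = 0; Lane < 64; ++Lane)
    if ((Exec >> Lane) & 1) {
      Add += V;                            // per-lane S_ADD_I32
      Sub -= V;                            // per-lane S_SUB_I32
      Xor ^= V;                            // per-lane S_XOR_B32
    }

  assert(Add == V * Count);                // S_MUL_I32(SrcReg, Count)
  assert(Sub == -V * Count);               // S_MUL_I32(-1 * SrcReg, Count)
  assert(Xor == V * (Count & 1));          // parity path
  return 0;
}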