[llvm] 27fa415 - [AMDGPU] Shrink MAD/FMA to MADAK/MADMK/FMAAK/FMAMK on GFX10
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Mon May 16 07:25:58 PDT 2022
Author: Jay Foad
Date: 2022-05-16T15:15:23+01:00
New Revision: 27fa41583fdee54d40ae680408f697e6d8201a8c
URL: https://github.com/llvm/llvm-project/commit/27fa41583fdee54d40ae680408f697e6d8201a8c
DIFF: https://github.com/llvm/llvm-project/commit/27fa41583fdee54d40ae680408f697e6d8201a8c.diff
LOG: [AMDGPU] Shrink MAD/FMA to MADAK/MADMK/FMAAK/FMAMK on GFX10
On GFX10 VOP3 instructions can have a literal operand, so the conversion
from VOP3 MAD/FMA to VOP2 MADAK/MADMK/FMAAK/FMAMK will not happen in
SIFoldOperands. The only benefit of the VOP2 form is code size, so do it
in SIShrinkInstructions instead.
Differential Revision: https://reviews.llvm.org/D125567
Added:
Modified:
llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
llvm/test/CodeGen/AMDGPU/madak.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 930fb82fbe28..9f00c78b256e 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -46,6 +46,7 @@ class SIShrinkInstructions : public MachineFunctionPass {
void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
void shrinkScalarCompare(MachineInstr &MI) const;
void shrinkMIMG(MachineInstr &MI) const;
+ void shrinkMadFma(MachineInstr &MI) const;
bool shrinkScalarLogicOp(MachineInstr &MI) const;
bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
Register Reg, unsigned SubReg) const;
@@ -324,6 +325,82 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
}
}
+// Shrink VOP3 V_MAD_F32/V_FMA_F32 with a non-inline literal to MADAK/MADMK or FMAAK/FMAMK; does nothing without VOP3 literal support or if any modifier is set.
+void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
+ if (!ST->hasVOP3Literal())
+ return;
+
+ if (TII->hasAnyModifiersSet(MI))
+ return;
+
+ const unsigned Opcode = MI.getOpcode();
+ MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END; // Sentinel: no shrunk form selected yet.
+
+ bool Swap; // Assigned by whichever form matches below; only read after NewOpcode has been set.
+
+ // Detect "Dst = VSrc * VGPR + Imm" (Src2 is a non-inline literal) and convert to AK form; Swap is set when Src1 is not a VGPR but Src0 is.
+ if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
+ if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
+ Swap = false;
+ else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
+ Swap = true;
+ else
+ return;
+
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unexpected mad/fma opcode!");
+ case AMDGPU::V_MAD_F32_e64:
+ NewOpcode = AMDGPU::V_MADAK_F32;
+ break;
+ case AMDGPU::V_FMA_F32_e64:
+ NewOpcode = AMDGPU::V_FMAAK_F32;
+ break;
+ }
+ }
+
+ // Detect "Dst = VSrc * Imm + VGPR" (Src2 is a VGPR) and convert to MK form; Swap is set when the non-inline literal is in Src0 rather than Src1.
+ if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
+ if (Src1.isImm() && !TII->isInlineConstant(Src1))
+ Swap = false;
+ else if (Src0.isImm() && !TII->isInlineConstant(Src0))
+ Swap = true;
+ else
+ return;
+
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unexpected mad/fma opcode!");
+ case AMDGPU::V_MAD_F32_e64:
+ NewOpcode = AMDGPU::V_MADMK_F32;
+ break;
+ case AMDGPU::V_FMA_F32_e64:
+ NewOpcode = AMDGPU::V_FMAMK_F32;
+ break;
+ }
+ }
+
+ if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) // Neither the AK nor the MK pattern matched.
+ return;
+
+ if (Swap) {
+ // Swap Src0 and Src1 by building a replacement instruction with the same destination register and MI flags, then erase the original.
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
+ MI.getOperand(0).getReg())
+ .add(Src1)
+ .add(Src0)
+ .add(Src2)
+ .setMIFlags(MI.getFlags());
+ MI.eraseFromParent();
+ } else {
+ TII->removeModOperands(MI);
+ MI.setDesc(TII->get(NewOpcode)); // No swap needed: MI is rewritten in place with the new opcode.
+ }
+}
+
/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
@@ -726,6 +803,16 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
}
+ if (!TII->isVOP3(MI))
+ continue;
+
+ // TODO: Also shrink F16 forms.
+ if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
+ MI.getOpcode() == AMDGPU::V_FMA_F32_e64) {
+ shrinkMadFma(MI);
+ continue;
+ }
+
if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
continue;
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
index 67c11cc5beae..8af7575f03d0 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
@@ -68,7 +68,7 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f32_e32 v4, v4, v10
; GCN-NEXT: v_mul_f32_e32 v3, v4, v6
-; GCN-NEXT: v_fma_f32 v4, v5, s0, 0x3ca3d70a
+; GCN-NEXT: v_fmaak_f32 v4, s0, v5, 0x3ca3d70a
; GCN-NEXT: v_mul_f32_e32 v1, v3, v1
; GCN-NEXT: v_mul_f32_e32 v2, v7, v4
; GCN-NEXT: v_fmac_f32_e32 v1, v2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index 28b8c846982f..40264342c295 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -135,8 +135,8 @@ define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out
; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
; GCN-NOT: v_madak_f32
; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
-; GFX10-MAD: v_mad_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
-; GFX10-FMA: v_fma_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
+; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
+; GFX10-FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
; GFX940-FMA: v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]]
define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) #0 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
More information about the llvm-commits
mailing list