[llvm] r267328 - [MachineCombiner] Support for floating-point FMA on ARM64 (re-commit r267098)
Gerolf Hoflehner via llvm-commits
llvm-commits at lists.llvm.org
Sat Apr 23 22:14:01 PDT 2016
Author: ghoflehner
Date: Sun Apr 24 00:14:01 2016
New Revision: 267328
URL: http://llvm.org/viewvc/llvm-project?rev=267328&view=rev
Log:
[MachineCombiner] Support for floating-point FMA on ARM64 (re-commit r267098)
The original patch caused crashes because it could derefence a null pointer
for SelectionDAGTargetInfo for targets that do not define it.
Evaluates fmul+fadd -> fmadd combines and similar code sequences in the
machine combiner. It adds support for float and double similar to the existing
integer implementation. The key features are:
- DAGCombiner checks whether it should combine greedily or let the machine
combiner do the evaluation. This is only supported on ARM64.
- It gives preference to throughput over latency: the heuristic used is
to combine always in loops. The targets decides whether the machine
combiner should optimize for throughput or latency.
- Supports for fmadd, f(n)msub, fmla, fmls patterns
- On by default at O3 ffast-math
Added:
llvm/trunk/test/CodeGen/AArch64/arm64-fma-combines.ll
llvm/trunk/test/CodeGen/AArch64/arm64-fml-combines.ll
Modified:
llvm/trunk/include/llvm/CodeGen/MachineCombinerPattern.h
llvm/trunk/include/llvm/CodeGen/SelectionDAGTargetInfo.h
llvm/trunk/include/llvm/Target/TargetInstrInfo.h
llvm/trunk/lib/CodeGen/MachineCombiner.cpp
llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/trunk/lib/CodeGen/TargetInstrInfo.cpp
llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.h
llvm/trunk/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
llvm/trunk/lib/Target/AArch64/AArch64SelectionDAGInfo.h
Modified: llvm/trunk/include/llvm/CodeGen/MachineCombinerPattern.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/MachineCombinerPattern.h?rev=267328&r1=267327&r2=267328&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/MachineCombinerPattern.h (original)
+++ llvm/trunk/include/llvm/CodeGen/MachineCombinerPattern.h Sun Apr 24 00:14:01 2016
@@ -38,7 +38,40 @@ enum class MachineCombinerPattern {
MULSUBX_OP1,
MULSUBX_OP2,
MULADDXI_OP1,
- MULSUBXI_OP1
+ MULSUBXI_OP1,
+ // Floating Point
+ FMULADDS_OP1,
+ FMULADDS_OP2,
+ FMULSUBS_OP1,
+ FMULSUBS_OP2,
+ FMULADDD_OP1,
+ FMULADDD_OP2,
+ FMULSUBD_OP1,
+ FMULSUBD_OP2,
+ FMLAv1i32_indexed_OP1,
+ FMLAv1i32_indexed_OP2,
+ FMLAv1i64_indexed_OP1,
+ FMLAv1i64_indexed_OP2,
+ FMLAv2f32_OP2,
+ FMLAv2f32_OP1,
+ FMLAv2f64_OP1,
+ FMLAv2f64_OP2,
+ FMLAv2i32_indexed_OP1,
+ FMLAv2i32_indexed_OP2,
+ FMLAv2i64_indexed_OP1,
+ FMLAv2i64_indexed_OP2,
+ FMLAv4f32_OP1,
+ FMLAv4f32_OP2,
+ FMLAv4i32_indexed_OP1,
+ FMLAv4i32_indexed_OP2,
+ FMLSv1i32_indexed_OP2,
+ FMLSv1i64_indexed_OP2,
+ FMLSv2i32_indexed_OP2,
+ FMLSv2i64_indexed_OP2,
+ FMLSv2f32_OP2,
+ FMLSv2f64_OP2,
+ FMLSv4i32_indexed_OP2,
+ FMLSv4f32_OP2
};
} // end namespace llvm
Modified: llvm/trunk/include/llvm/CodeGen/SelectionDAGTargetInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/SelectionDAGTargetInfo.h?rev=267328&r1=267327&r2=267328&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/SelectionDAGTargetInfo.h (original)
+++ llvm/trunk/include/llvm/CodeGen/SelectionDAGTargetInfo.h Sun Apr 24 00:14:01 2016
@@ -17,6 +17,7 @@
#define LLVM_CODEGEN_SELECTIONDAGTARGETINFO_H
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/CodeGen.h"
namespace llvm {
@@ -138,6 +139,11 @@ public:
MachinePointerInfo SrcPtrInfo) const {
return std::make_pair(SDValue(), SDValue());
}
+ // Return true when the decision to generate FMA's (or FMS, FMLA etc) rather
+ // than FMUL and ADD is delegated to the machine combiner.
+ virtual bool GenerateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const {
+ return false;
+ }
};
} // end llvm namespace
Modified: llvm/trunk/include/llvm/Target/TargetInstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetInstrInfo.h?rev=267328&r1=267327&r2=267328&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Target/TargetInstrInfo.h (original)
+++ llvm/trunk/include/llvm/Target/TargetInstrInfo.h Sun Apr 24 00:14:01 2016
@@ -818,6 +818,11 @@ public:
MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns) const;
+ /// Return true when a code sequence can improve throughput. It
+ /// should be called only for instructions in loops.
+ /// \param Pattern - combiner pattern
+ virtual bool isThroughputPattern(MachineCombinerPattern Pattern) const;
+
/// Return true if the input \P Inst is part of a chain of dependent ops
/// that are suitable for reassociation, otherwise return false.
/// If the instruction's operands must be commuted to have a previous
Modified: llvm/trunk/lib/CodeGen/MachineCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineCombiner.cpp?rev=267328&r1=267327&r2=267328&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/MachineCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/MachineCombiner.cpp Sun Apr 24 00:14:01 2016
@@ -40,6 +40,7 @@ class MachineCombiner : public MachineFu
const TargetRegisterInfo *TRI;
MCSchedModel SchedModel;
MachineRegisterInfo *MRI;
+ MachineLoopInfo *MLI; // Current MachineLoopInfo
MachineTraceMetrics *Traces;
MachineTraceMetrics::Ensemble *MinInstr;
@@ -86,6 +87,7 @@ char &llvm::MachineCombinerID = MachineC
INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
"Machine InstCombiner", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
false, false)
@@ -93,6 +95,7 @@ INITIALIZE_PASS_END(MachineCombiner, "ma
void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
AU.addPreserved<MachineLoopInfo>();
AU.addRequired<MachineTraceMetrics>();
AU.addPreserved<MachineTraceMetrics>();
@@ -354,6 +357,8 @@ bool MachineCombiner::combineInstruction
DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
auto BlockIter = MBB->begin();
+ // Check if the block is in a loop.
+ const MachineLoop *ML = MLI->getLoopFor(MBB);
while (BlockIter != MBB->end()) {
auto &MI = *BlockIter++;
@@ -406,11 +411,15 @@ bool MachineCombiner::combineInstruction
if (!NewInstCount)
continue;
+ bool SubstituteAlways = false;
+ if (ML && TII->isThroughputPattern(P))
+ SubstituteAlways = true;
+
// Substitute when we optimize for codesize and the new sequence has
// fewer instructions OR
// the new sequence neither lengthens the critical path nor increases
// resource pressure.
- if (doSubstitute(NewInstCount, OldInstCount) ||
+ if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
(improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
InstrIdxForVirtReg, P) &&
preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
@@ -447,6 +456,7 @@ bool MachineCombiner::runOnMachineFuncti
SchedModel = STI.getSchedModel();
TSchedModel.init(SchedModel, &STI, TII);
MRI = &MF.getRegInfo();
+ MLI = &getAnalysis<MachineLoopInfo>();
Traces = &getAnalysis<MachineTraceMetrics>();
MinInstr = nullptr;
OptSize = MF.getFunction()->optForSize();
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=267328&r1=267327&r2=267328&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Sun Apr 24 00:14:01 2016
@@ -24,6 +24,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
@@ -7716,6 +7717,11 @@ SDValue DAGCombiner::visitFADDForFMAComb
if (!HasFMAD && !HasFMA)
return SDValue();
+ const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
+ ;
+ if (AllowFusion && STI && STI->GenerateFMAsInMachineCombiner(OptLevel))
+ return SDValue();
+
// Always prefer FMAD to FMA for precision.
unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
@@ -7899,6 +7905,10 @@ SDValue DAGCombiner::visitFSUBForFMAComb
if (!HasFMAD && !HasFMA)
return SDValue();
+ const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
+ if (AllowFusion && STI && STI->GenerateFMAsInMachineCombiner(OptLevel))
+ return SDValue();
+
// Always prefer FMAD to FMA for precision.
unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
@@ -8367,7 +8377,6 @@ SDValue DAGCombiner::visitFADD(SDNode *N
AddToWorklist(Fused.getNode());
return Fused;
}
-
return SDValue();
}
Modified: llvm/trunk/lib/CodeGen/TargetInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/TargetInstrInfo.cpp?rev=267328&r1=267327&r2=267328&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/TargetInstrInfo.cpp (original)
+++ llvm/trunk/lib/CodeGen/TargetInstrInfo.cpp Sun Apr 24 00:14:01 2016
@@ -655,7 +655,11 @@ bool TargetInstrInfo::getMachineCombiner
return false;
}
-
+/// Return true when a code sequence can improve loop throughput.
+bool
+TargetInstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
+ return false;
+}
/// Attempt the reassociation transformation to reduce critical path length.
/// See the above comments before getMachineCombinerPatterns().
void TargetInstrInfo::reassociateOps(
Modified: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp?rev=267328&r1=267327&r2=267328&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp Sun Apr 24 00:14:01 2016
@@ -2787,37 +2787,75 @@ static bool isCombineInstrCandidate64(un
return false;
}
//
+// FP Opcodes that can be combined with a FMUL
+static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
+ switch (Inst.getOpcode()) {
+ case AArch64::FADDSrr:
+ case AArch64::FADDDrr:
+ case AArch64::FADDv2f32:
+ case AArch64::FADDv2f64:
+ case AArch64::FADDv4f32:
+ case AArch64::FSUBSrr:
+ case AArch64::FSUBDrr:
+ case AArch64::FSUBv2f32:
+ case AArch64::FSUBv2f64:
+ case AArch64::FSUBv4f32:
+ return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
+ default:
+ break;
+ }
+ return false;
+}
+//
// Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate(unsigned Opc) {
return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
}
-static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
- unsigned MulOpc, unsigned ZeroReg) {
+//
+// Utility routine that checks if \param MO is defined by an
+// \param CombineOpc instruction in the basic block \param MBB
+static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
+ unsigned CombineOpc, unsigned ZeroReg = 0,
+ bool CheckZeroReg = false) {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineInstr *MI = nullptr;
- // We need a virtual register definition.
+
if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
MI = MRI.getUniqueVRegDef(MO.getReg());
// And it needs to be in the trace (otherwise, it won't have a depth).
- if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc)
+ if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
return false;
-
- assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
- MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
- MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
-
- // The third input reg must be zero.
- if (MI->getOperand(3).getReg() != ZeroReg)
- return false;
-
// Must only used by the user we combine with.
if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
return false;
+ if (CheckZeroReg) {
+ assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
+ MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
+ MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
+ // The third input reg must be zero.
+ if (MI->getOperand(3).getReg() != ZeroReg)
+ return false;
+ }
+
return true;
}
+//
+// Is \param MO defined by an integer multiply and can be combined?
+static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
+ unsigned MulOpc, unsigned ZeroReg) {
+ return canCombine(MBB, MO, MulOpc, ZeroReg, true);
+}
+
+//
+// Is \param MO defined by a floating-point multiply and can be combined?
+static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
+ unsigned MulOpc) {
+ return canCombine(MBB, MO, MulOpc);
+}
+
// TODO: There are many more machine instruction opcodes to match:
// 1. Other data types (integer, vectors)
// 2. Other math / logic operations (xor, or)
@@ -2951,7 +2989,230 @@ static bool getMaddPatterns(MachineInstr
}
return Found;
}
+/// Floating-Point Support
+
+/// Find instructions that can be turned into madd.
+static bool getFMAPatterns(MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+
+ if (!isCombineInstrCandidateFP(Root))
+ return 0;
+
+ MachineBasicBlock &MBB = *Root.getParent();
+ bool Found = false;
+
+ switch (Root.getOpcode()) {
+ default:
+ assert(false && "Unsupported FP instruction in combiner\n");
+ break;
+ case AArch64::FADDSrr:
+ assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
+ "FADDWrr does not have register operands");
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv1i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv1i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FADDDrr:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv1i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv1i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FADDv2f32:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv2i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv2f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FADDv2f64:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv2i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv2f64)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2f64)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FADDv4f32:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv4i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+ AArch64::FMULv4f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv4i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv4f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2);
+ Found = true;
+ }
+ break;
+
+ case AArch64::FSUBSrr:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv1i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FSUBDrr:
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1);
+ Found = true;
+ }
+ if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
+ Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv1i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FSUBv2f32:
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FSUBv2f64:
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2i64_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv2f64)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::FSUBv4f32:
+ if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv4i32_indexed)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2);
+ Found = true;
+ } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+ AArch64::FMULv4f32)) {
+ Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2);
+ Found = true;
+ }
+ break;
+ }
+ return Found;
+}
+/// Return true when a code sequence can improve throughput. It
+/// should be called only for instructions in loops.
+/// \param Pattern - combiner pattern
+bool
+AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
+ switch (Pattern) {
+ default:
+ break;
+ case MachineCombinerPattern::FMULADDS_OP1:
+ case MachineCombinerPattern::FMULADDS_OP2:
+ case MachineCombinerPattern::FMULSUBS_OP1:
+ case MachineCombinerPattern::FMULSUBS_OP2:
+ case MachineCombinerPattern::FMULADDD_OP1:
+ case MachineCombinerPattern::FMULADDD_OP2:
+ case MachineCombinerPattern::FMULSUBD_OP1:
+ case MachineCombinerPattern::FMULSUBD_OP2:
+ case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
+ case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
+ case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
+ case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
+ case MachineCombinerPattern::FMLAv2f32_OP2:
+ case MachineCombinerPattern::FMLAv2f32_OP1:
+ case MachineCombinerPattern::FMLAv2f64_OP1:
+ case MachineCombinerPattern::FMLAv2f64_OP2:
+ case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
+ case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
+ case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
+ case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
+ case MachineCombinerPattern::FMLAv4f32_OP1:
+ case MachineCombinerPattern::FMLAv4f32_OP2:
+ case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
+ case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
+ case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
+ case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
+ case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
+ case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
+ case MachineCombinerPattern::FMLSv2f32_OP2:
+ case MachineCombinerPattern::FMLSv2f64_OP2:
+ case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
+ case MachineCombinerPattern::FMLSv4f32_OP2:
+ return true;
+ } // end switch (Pattern)
+ return false;
+}
/// Return true when there is potentially a faster code sequence for an
/// instruction chain ending in \p Root. All potential patterns are listed in
/// the \p Pattern vector. Pattern should be sorted in priority order since the
@@ -2960,28 +3221,35 @@ static bool getMaddPatterns(MachineInstr
bool AArch64InstrInfo::getMachineCombinerPatterns(
MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+ // Integer patterns
if (getMaddPatterns(Root, Patterns))
return true;
+ // Floating point patterns
+ if (getFMAPatterns(Root, Patterns))
+ return true;
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
}
-/// genMadd - Generate madd instruction and combine mul and add.
-/// Example:
-/// MUL I=A,B,0
-/// ADD R,I,C
-/// ==> MADD R,A,B,C
-/// \param Root is the ADD instruction
+enum class FMAInstKind { Default, Indexed, Accumulator };
+/// genFusedMultiply - Generate fused multiply instructions.
+/// This function supports both integer and floating point instructions.
+/// A typical example:
+/// F|MUL I=A,B,0
+/// F|ADD R,I,C
+/// ==> F|MADD R,A,B,C
+/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
-/// the MUL. In the example above IdxMulOpd is 1.
-/// \param MaddOpc the opcode fo the madd instruction
-static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI,
- const TargetInstrInfo *TII, MachineInstr &Root,
- SmallVectorImpl<MachineInstr *> &InsInstrs,
- unsigned IdxMulOpd, unsigned MaddOpc,
- const TargetRegisterClass *RC) {
+/// the F|MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc the opcode fo the f|madd instruction
+static MachineInstr *
+genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII, MachineInstr &Root,
+ SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
+ unsigned MaddOpc, const TargetRegisterClass *RC,
+ FMAInstKind kind = FMAInstKind::Default) {
assert(IdxMulOpd == 1 || IdxMulOpd == 2);
unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
@@ -3003,12 +3271,26 @@ static MachineInstr *genMadd(MachineFunc
if (TargetRegisterInfo::isVirtualRegister(SrcReg2))
MRI.constrainRegClass(SrcReg2, RC);
- MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc),
- ResultReg)
- .addReg(SrcReg0, getKillRegState(Src0IsKill))
- .addReg(SrcReg1, getKillRegState(Src1IsKill))
- .addReg(SrcReg2, getKillRegState(Src2IsKill));
- // Insert the MADD
+ MachineInstrBuilder MIB;
+ if (kind == FMAInstKind::Default)
+ MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
+ .addReg(SrcReg0, getKillRegState(Src0IsKill))
+ .addReg(SrcReg1, getKillRegState(Src1IsKill))
+ .addReg(SrcReg2, getKillRegState(Src2IsKill));
+ else if (kind == FMAInstKind::Indexed)
+ MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
+ .addReg(SrcReg2, getKillRegState(Src2IsKill))
+ .addReg(SrcReg0, getKillRegState(Src0IsKill))
+ .addReg(SrcReg1, getKillRegState(Src1IsKill))
+ .addImm(MUL->getOperand(3).getImm());
+ else if (kind == FMAInstKind::Accumulator)
+ MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
+ .addReg(SrcReg2, getKillRegState(Src2IsKill))
+ .addReg(SrcReg0, getKillRegState(Src0IsKill))
+ .addReg(SrcReg1, getKillRegState(Src1IsKill));
+ else
+ assert(false && "Invalid FMA instruction kind \n");
+ // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL)
InsInstrs.push_back(MIB);
return MUL;
}
@@ -3096,7 +3378,7 @@ void AArch64InstrInfo::genAlternativeCod
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
- MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
case MachineCombinerPattern::MULADDW_OP2:
case MachineCombinerPattern::MULADDX_OP2:
@@ -3111,7 +3393,7 @@ void AArch64InstrInfo::genAlternativeCod
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
- MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULADDWI_OP1:
case MachineCombinerPattern::MULADDXI_OP1: {
@@ -3203,7 +3485,7 @@ void AArch64InstrInfo::genAlternativeCod
Opc = AArch64::MSUBXrrr;
RC = &AArch64::GPR64RegClass;
}
- MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case MachineCombinerPattern::MULSUBWI_OP1:
case MachineCombinerPattern::MULSUBXI_OP1: {
@@ -3248,6 +3530,234 @@ void AArch64InstrInfo::genAlternativeCod
}
break;
}
+ // Floating Point Support
+ case MachineCombinerPattern::FMULADDS_OP1:
+ case MachineCombinerPattern::FMULADDD_OP1:
+ // MUL I=A,B,0
+ // ADD R,I,C
+ // ==> MADD R,A,B,C
+ // --- Create(MADD);
+ if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
+ Opc = AArch64::FMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::FMULADDS_OP2:
+ case MachineCombinerPattern::FMULADDD_OP2:
+ // FMUL I=A,B,0
+ // FADD R,C,I
+ // ==> FMADD R,A,B,C
+ // --- Create(FMADD);
+ if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
+ Opc = AArch64::FMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+
+ case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
+ Opc = AArch64::FMLAv1i32_indexed;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+ case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
+ Opc = AArch64::FMLAv1i32_indexed;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
+ case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
+ Opc = AArch64::FMLAv1i64_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+ case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
+ Opc = AArch64::FMLAv1i64_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
+ case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
+ case MachineCombinerPattern::FMLAv2f32_OP1:
+ RC = &AArch64::FPR64RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
+ Opc = AArch64::FMLAv2i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv2f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+ case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
+ case MachineCombinerPattern::FMLAv2f32_OP2:
+ RC = &AArch64::FPR64RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
+ Opc = AArch64::FMLAv2i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv2f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
+ case MachineCombinerPattern::FMLAv2f64_OP1:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
+ Opc = AArch64::FMLAv2i64_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv2f64;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+ case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
+ case MachineCombinerPattern::FMLAv2f64_OP2:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
+ Opc = AArch64::FMLAv2i64_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv2f64;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
+ case MachineCombinerPattern::FMLAv4f32_OP1:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
+ Opc = AArch64::FMLAv4i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv4f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
+ case MachineCombinerPattern::FMLAv4f32_OP2:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
+ Opc = AArch64::FMLAv4i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLAv4f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMULSUBS_OP1:
+ case MachineCombinerPattern::FMULSUBD_OP1: {
+ // FMUL I=A,B,0
+ // FSUB R,I,C
+ // ==> FNMSUB R,A,B,C // = -C + A*B
+ // --- Create(FNMSUB);
+ if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
+ Opc = AArch64::FNMSUBSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FNMSUBDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ }
+ case MachineCombinerPattern::FMULSUBS_OP2:
+ case MachineCombinerPattern::FMULSUBD_OP2: {
+ // FMUL I=A,B,0
+ // FSUB R,C,I
+ // ==> FMSUB R,A,B,C (computes C - A*B)
+ // --- Create(FMSUB);
+ if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
+ Opc = AArch64::FMSUBSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FMSUBDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+
+ case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
+ Opc = AArch64::FMLSv1i32_indexed;
+ RC = &AArch64::FPR32RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
+ case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
+ Opc = AArch64::FMLSv1i64_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ break;
+
+ case MachineCombinerPattern::FMLSv2f32_OP2:
+ case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
+ RC = &AArch64::FPR64RegClass;
+ if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
+ Opc = AArch64::FMLSv2i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLSv2f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLSv2f64_OP2:
+ case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
+ Opc = AArch64::FMLSv2i64_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLSv2f64;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+
+ case MachineCombinerPattern::FMLSv4f32_OP2:
+ case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
+ RC = &AArch64::FPR128RegClass;
+ if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
+ Opc = AArch64::FMLSv4i32_indexed;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Indexed);
+ } else {
+ Opc = AArch64::FMLSv4f32;
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+ FMAInstKind::Accumulator);
+ }
+ break;
+ }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
DelInstrs.push_back(MUL);
Modified: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.h?rev=267328&r1=267327&r2=267328&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.h (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.h Sun Apr 24 00:14:01 2016
@@ -174,6 +174,11 @@ public:
unsigned SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const override;
bool optimizeCondBranch(MachineInstr *MI) const override;
+
+ /// Return true when a code sequence can improve throughput. It
+ /// should be called only for instructions in loops.
+ /// \param Pattern - combiner pattern
+ bool isThroughputPattern(MachineCombinerPattern Pattern) const override;
/// Return true when there is potentially a faster code sequence
/// for an instruction chain ending in <Root>. All potential patterns are
/// listed in the <Patterns> array.
Modified: llvm/trunk/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp?rev=267328&r1=267327&r2=267328&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp Sun Apr 24 00:14:01 2016
@@ -51,3 +51,9 @@ SDValue AArch64SelectionDAGInfo::EmitTar
}
return SDValue();
}
+bool AArch64SelectionDAGInfo::GenerateFMAsInMachineCombiner(
+ CodeGenOpt::Level OptLevel) const {
+ if (OptLevel >= CodeGenOpt::Aggressive)
+ return true;
+ return false;
+}
Modified: llvm/trunk/lib/Target/AArch64/AArch64SelectionDAGInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64SelectionDAGInfo.h?rev=267328&r1=267327&r2=267328&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64SelectionDAGInfo.h (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64SelectionDAGInfo.h Sun Apr 24 00:14:01 2016
@@ -25,6 +25,7 @@ public:
SDValue Dst, SDValue Src, SDValue Size,
unsigned Align, bool isVolatile,
MachinePointerInfo DstPtrInfo) const override;
+ bool GenerateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override;
};
}
Added: llvm/trunk/test/CodeGen/AArch64/arm64-fma-combines.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-fma-combines.ll?rev=267328&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-fma-combines.ll (added)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-fma-combines.ll Sun Apr 24 00:14:01 2016
@@ -0,0 +1,136 @@
+; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s
+define void @foo_2d(double* %src) {
+; CHECK-LABEL: %entry
+; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+ %arrayidx1 = getelementptr inbounds double, double* %src, i64 5
+ %arrayidx2 = getelementptr inbounds double, double* %src, i64 11
+ %tmp = bitcast double* %arrayidx1 to <2 x double>*
+ %tmp1 = load double, double* %arrayidx2, align 8
+ %tmp2 = load double, double* %arrayidx1, align 8
+ %fmul = fmul fast double %tmp1, %tmp1
+ %fmul2 = fmul fast double %tmp2, 0x3F94AFD6A052BF5B
+ %fadd = fadd fast double %fmul, %fmul2
+ br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmla.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx3 = getelementptr inbounds double, double* %src, i64 %indvars.iv.next
+ %tmp3 = load double, double* %arrayidx3, align 8
+ %add = fadd fast double %tmp3, %tmp3
+ %mul = fmul fast double %add, %fadd
+ %e1 = insertelement <2 x double> undef, double %add, i32 0
+ %e2 = insertelement <2 x double> %e1, double %add, i32 1
+ %add2 = fadd fast <2 x double> %e2, <double 3.000000e+00, double -3.000000e+00>
+ %e3 = insertelement <2 x double> undef, double %mul, i32 0
+ %e4 = insertelement <2 x double> %e3, double %mul, i32 1
+ %mul2 = fmul fast <2 x double> %add2,<double 3.000000e+00, double -3.000000e+00>
+ %e5 = insertelement <2 x double> undef, double %add, i32 0
+ %e6 = insertelement <2 x double> %e5, double %add, i32 1
+ %add3 = fadd fast <2 x double> %mul2, <double 3.000000e+00, double -3.000000e+00>
+ %mulx = fmul fast <2 x double> %add2, %e2
+ %addx = fadd fast <2 x double> %mulx, %e4
+ %e7 = insertelement <2 x double> undef, double %mul, i32 0
+ %e8 = insertelement <2 x double> %e7, double %mul, i32 1
+ %e9 = fmul fast <2 x double> %addx, %add3
+ store <2 x double> %e9, <2 x double>* %tmp, align 8
+ %e10 = extractelement <2 x double> %add3, i32 0
+ %mul3 = fmul fast double %mul, %e10
+ %add4 = fadd fast double %mul3, %mul
+ store double %add4, double* %arrayidx2, align 8
+ %exitcond = icmp eq i64 %indvars.iv.next, 25
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+define void @foo_2s(float* %src) {
+entry:
+ %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
+ %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
+ %tmp = bitcast float* %arrayidx1 to <2 x float>*
+ br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmla.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
+ %tmp1 = load float, float* %arrayidx3, align 8
+ %add = fadd fast float %tmp1, %tmp1
+ %mul = fmul fast float %add, %add
+ %e1 = insertelement <2 x float> undef, float %add, i32 0
+ %e2 = insertelement <2 x float> %e1, float %add, i32 1
+ %add2 = fadd fast <2 x float> %e2, <float 3.000000e+00, float -3.000000e+00>
+ %e3 = insertelement <2 x float> undef, float %mul, i32 0
+ %e4 = insertelement <2 x float> %e3, float %mul, i32 1
+ %mul2 = fmul fast <2 x float> %add2,<float 3.000000e+00, float -3.000000e+00>
+ %e5 = insertelement <2 x float> undef, float %add, i32 0
+ %e6 = insertelement <2 x float> %e5, float %add, i32 1
+ %add3 = fadd fast <2 x float> %mul2, <float 3.000000e+00, float -3.000000e+00>
+ %mulx = fmul fast <2 x float> %add2, %e2
+ %addx = fadd fast <2 x float> %mulx, %e4
+ %e7 = insertelement <2 x float> undef, float %mul, i32 0
+ %e8 = insertelement <2 x float> %e7, float %mul, i32 1
+ %e9 = fmul fast <2 x float> %addx, %add3
+ store <2 x float> %e9, <2 x float>* %tmp, align 8
+ %e10 = extractelement <2 x float> %add3, i32 0
+ %mul3 = fmul fast float %mul, %e10
+ %add4 = fadd fast float %mul3, %mul
+ store float %add4, float* %arrayidx2, align 8
+ %exitcond = icmp eq i64 %indvars.iv.next, 25
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+define void @foo_4s(float* %src) {
+entry:
+ %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
+ %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
+ %tmp = bitcast float* %arrayidx1 to <4 x float>*
+ br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmla.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmla.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
+ %tmp1 = load float, float* %arrayidx3, align 8
+ %add = fadd fast float %tmp1, %tmp1
+ %mul = fmul fast float %add, %add
+ %e1 = insertelement <4 x float> undef, float %add, i32 0
+ %e2 = insertelement <4 x float> %e1, float %add, i32 1
+ %add2 = fadd fast <4 x float> %e2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
+ %e3 = insertelement <4 x float> undef, float %mul, i32 0
+ %e4 = insertelement <4 x float> %e3, float %mul, i32 1
+ %mul2 = fmul fast <4 x float> %add2,<float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
+ %e5 = insertelement <4 x float> undef, float %add, i32 0
+ %e6 = insertelement <4 x float> %e5, float %add, i32 1
+ %add3 = fadd fast <4 x float> %mul2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
+ %mulx = fmul fast <4 x float> %add2, %e2
+ %addx = fadd fast <4 x float> %mulx, %e4
+ %e7 = insertelement <4 x float> undef, float %mul, i32 0
+ %e8 = insertelement <4 x float> %e7, float %mul, i32 1
+ %e9 = fmul fast <4 x float> %addx, %add3
+ store <4 x float> %e9, <4 x float>* %tmp, align 8
+ %e10 = extractelement <4 x float> %add3, i32 0
+ %mul3 = fmul fast float %mul, %e10
+ store float %mul3, float* %arrayidx2, align 8
+ %exitcond = icmp eq i64 %indvars.iv.next, 25
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
Added: llvm/trunk/test/CodeGen/AArch64/arm64-fml-combines.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-fml-combines.ll?rev=267328&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-fml-combines.ll (added)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-fml-combines.ll Sun Apr 24 00:14:01 2016
@@ -0,0 +1,128 @@
+; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s
+define void @foo_2d(double* %src) {
+entry:
+ %arrayidx1 = getelementptr inbounds double, double* %src, i64 5
+ %arrayidx2 = getelementptr inbounds double, double* %src, i64 11
+ %tmp = bitcast double* %arrayidx1 to <2 x double>*
+ br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmls.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %indvars.iv.next = sub nuw nsw i64 %indvars.iv, 1
+ %arrayidx3 = getelementptr inbounds double, double* %src, i64 %indvars.iv.next
+ %tmp1 = load double, double* %arrayidx3, align 8
+ %add = fadd fast double %tmp1, %tmp1
+ %mul = fmul fast double %add, %add
+ %e1 = insertelement <2 x double> undef, double %add, i32 0
+ %e2 = insertelement <2 x double> %e1, double %add, i32 1
+ %sub2 = fsub fast <2 x double> %e2, <double 3.000000e+00, double -3.000000e+00>
+ %e3 = insertelement <2 x double> undef, double %mul, i32 0
+ %e4 = insertelement <2 x double> %e3, double %mul, i32 1
+ %mul2 = fmul fast <2 x double> %sub2,<double 3.000000e+00, double -3.000000e+00>
+ %e5 = insertelement <2 x double> undef, double %add, i32 0
+ %e6 = insertelement <2 x double> %e5, double %add, i32 1
+ %sub3 = fsub fast <2 x double> <double 3.000000e+00, double -3.000000e+00>, %mul2
+ %mulx = fmul fast <2 x double> %sub2, %e2
+ %subx = fsub fast <2 x double> %e4, %mulx
+ %e7 = insertelement <2 x double> undef, double %mul, i32 0
+ %e8 = insertelement <2 x double> %e7, double %mul, i32 1
+ %e9 = fmul fast <2 x double> %subx, %sub3
+ store <2 x double> %e9, <2 x double>* %tmp, align 8
+ %e10 = extractelement <2 x double> %sub3, i32 0
+ %mul3 = fmul fast double %mul, %e10
+ %sub4 = fsub fast double %mul, %mul3
+ store double %sub4, double* %arrayidx2, align 8
+ %exitcond = icmp eq i64 %indvars.iv.next, 25
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+define void @foo_2s(float* %src) {
+entry:
+ %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
+ %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
+ %tmp = bitcast float* %arrayidx1 to <2 x float>*
+ br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmls.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
+ %tmp1 = load float, float* %arrayidx3, align 8
+ %add = fadd fast float %tmp1, %tmp1
+ %mul = fmul fast float %add, %add
+ %e1 = insertelement <2 x float> undef, float %add, i32 0
+ %e2 = insertelement <2 x float> %e1, float %add, i32 1
+ %add2 = fsub fast <2 x float> %e2, <float 3.000000e+00, float -3.000000e+00>
+ %e3 = insertelement <2 x float> undef, float %mul, i32 0
+ %e4 = insertelement <2 x float> %e3, float %mul, i32 1
+ %mul2 = fmul fast <2 x float> %add2,<float 3.000000e+00, float -3.000000e+00>
+ %e5 = insertelement <2 x float> undef, float %add, i32 0
+ %e6 = insertelement <2 x float> %e5, float %add, i32 1
+ %add3 = fsub fast <2 x float> <float 3.000000e+00, float -3.000000e+00>, %mul2
+ %mulx = fmul fast <2 x float> %add2, %e2
+ %addx = fsub fast <2 x float> %e4, %mulx
+ %e7 = insertelement <2 x float> undef, float %mul, i32 0
+ %e8 = insertelement <2 x float> %e7, float %mul, i32 1
+ %e9 = fmul fast <2 x float> %addx, %add3
+ store <2 x float> %e9, <2 x float>* %tmp, align 8
+ %e10 = extractelement <2 x float> %add3, i32 0
+ %mul3 = fmul fast float %mul, %e10
+ %add4 = fsub fast float %mul, %mul3
+ store float %add4, float* %arrayidx2, align 8
+ %exitcond = icmp eq i64 %indvars.iv.next, 25
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+define void @foo_4s(float* %src) {
+entry:
+ %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
+ %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
+ %tmp = bitcast float* %arrayidx1 to <4 x float>*
+ br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmls.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmls.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
+ %tmp1 = load float, float* %arrayidx3, align 8
+ %add = fadd fast float %tmp1, %tmp1
+ %mul = fmul fast float %add, %add
+ %e1 = insertelement <4 x float> undef, float %add, i32 0
+ %e2 = insertelement <4 x float> %e1, float %add, i32 1
+ %add2 = fadd fast <4 x float> %e2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
+ %e3 = insertelement <4 x float> undef, float %mul, i32 0
+ %e4 = insertelement <4 x float> %e3, float %mul, i32 1
+ %mul2 = fmul fast <4 x float> %add2,<float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
+ %e5 = insertelement <4 x float> undef, float %add, i32 0
+ %e6 = insertelement <4 x float> %e5, float %add, i32 1
+ %add3 = fsub fast <4 x float> <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00> , %mul2
+ %mulx = fmul fast <4 x float> %add2, %e2
+ %addx = fsub fast <4 x float> %e4, %mulx
+ %e7 = insertelement <4 x float> undef, float %mul, i32 0
+ %e8 = insertelement <4 x float> %e7, float %mul, i32 1
+ %e9 = fmul fast <4 x float> %addx, %add3
+ store <4 x float> %e9, <4 x float>* %tmp, align 8
+ %e10 = extractelement <4 x float> %add3, i32 0
+ %mul3 = fmul fast float %mul, %e10
+ store float %mul3, float* %arrayidx2, align 8
+ %exitcond = icmp eq i64 %indvars.iv.next, 25
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
More information about the llvm-commits
mailing list