[llvm-commits] [llvm] r128665 - in /llvm/trunk: lib/Target/ARM/ARM.td lib/Target/ARM/ARMISelLowering.cpp lib/Target/ARM/ARMSubtarget.h test/CodeGen/ARM/vmul.ll
Evan Cheng
evan.cheng at apple.com
Thu Mar 31 12:38:48 PDT 2011
Author: evancheng
Date: Thu Mar 31 14:38:48 2011
New Revision: 128665
URL: http://llvm.org/viewvc/llvm-project?rev=128665&view=rev
Log:
Distribute (A + B) * C to (A * C) + (B * C) to make use of NEON multiplier
accumulator forwarding:
vadd d3, d0, d1
vmul d3, d3, d2
=>
vmul d3, d0, d2
vmla d3, d1, d2
Modified:
llvm/trunk/lib/Target/ARM/ARM.td
llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
llvm/trunk/lib/Target/ARM/ARMSubtarget.h
llvm/trunk/test/CodeGen/ARM/vmul.ll
Modified: llvm/trunk/lib/Target/ARM/ARM.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARM.td?rev=128665&r1=128664&r2=128665&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARM.td (original)
+++ llvm/trunk/lib/Target/ARM/ARM.td Thu Mar 31 14:38:48 2011
@@ -51,6 +51,12 @@
// to just not use them.
def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
"Disable VFP / NEON MAC instructions">;
+
+// Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding.
+def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
+ "HasVMLxForwarding", "true",
+ "Has multiplier accumulator forwarding">;
+
// Some processors benefit from using NEON instructions for scalar
// single-precision FP operations.
def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
@@ -100,11 +106,12 @@
def ProcA8 : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8",
"Cortex-A8 ARM processors",
[FeatureSlowFPBrcc, FeatureNEONForFP,
- FeatureHasSlowFPVMLx, FeatureT2XtPk]>;
+ FeatureHasSlowFPVMLx, FeatureVMLxForwarding,
+ FeatureT2XtPk]>;
def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
"Cortex-A9 ARM processors",
- [FeatureHasSlowFPVMLx, FeatureT2XtPk,
- FeatureFP16]>;
+ [FeatureHasSlowFPVMLx, FeatureVMLxForwarding,
+ FeatureT2XtPk, FeatureFP16]>;
class ProcNoItin<string Name, list<SubtargetFeature> Features>
: Processor<Name, GenericItineraries, Features>;
Modified: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp?rev=128665&r1=128664&r2=128665&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp Thu Mar 31 14:38:48 2011
@@ -5224,6 +5224,42 @@
return SDValue();
}
+/// PerformVMULCombine
+/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
+/// special multiplier accumulator forwarding.
+/// vmul d3, d0, d2
+/// vmla d3, d1, d2
+/// is faster than
+/// vadd d3, d0, d1
+/// vmul d3, d3, d2
+static SDValue PerformVMULCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ if (!Subtarget->hasVMLxForwarding())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ unsigned Opcode = N0.getOpcode();
+ if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+ Opcode != ISD::FADD && Opcode != ISD::FSUB) {
+ Opcode = N0.getOpcode();
+ if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+ Opcode != ISD::FADD && Opcode != ISD::FSUB)
+ return SDValue();
+ std::swap(N0, N1);
+ }
+
+ EVT VT = N->getValueType(0);
+ DebugLoc DL = N->getDebugLoc();
+ SDValue N00 = N0->getOperand(0);
+ SDValue N01 = N0->getOperand(1);
+ return DAG.getNode(Opcode, DL, VT,
+ DAG.getNode(ISD::MUL, DL, VT, N00, N1),
+ DAG.getNode(ISD::MUL, DL, VT, N01, N1));
+}
+
static SDValue PerformMULCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
@@ -5236,6 +5272,8 @@
return SDValue();
EVT VT = N->getValueType(0);
+ if (VT.is64BitVector() || VT.is128BitVector())
+ return PerformVMULCombine(N, DCI, Subtarget);
if (VT != MVT::i32)
return SDValue();
Modified: llvm/trunk/lib/Target/ARM/ARMSubtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMSubtarget.h?rev=128665&r1=128664&r2=128665&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMSubtarget.h (original)
+++ llvm/trunk/lib/Target/ARM/ARMSubtarget.h Thu Mar 31 14:38:48 2011
@@ -61,6 +61,10 @@
/// whether the FP VML[AS] instructions are slow (if so, don't use them).
bool SlowFPVMLx;
+ /// HasVMLxForwarding - If true, NEON has special multiplier accumulator
+ /// forwarding to allow mul + mla being issued back to back.
+ bool HasVMLxForwarding;
+
/// SlowFPBrcc - True if floating point compare + branch is slow.
bool SlowFPBrcc;
@@ -182,6 +186,7 @@
bool hasT2ExtractPack() const { return HasT2ExtractPack; }
bool hasDataBarrier() const { return HasDataBarrier; }
bool useFPVMLx() const { return !SlowFPVMLx; }
+ bool hasVMLxForwarding() const { return HasVMLxForwarding; }
bool isFPBrccSlow() const { return SlowFPBrcc; }
bool isFPOnlySP() const { return FPOnlySP; }
bool prefers32BitThumb() const { return Pref32BitThumb; }
Modified: llvm/trunk/test/CodeGen/ARM/vmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vmul.ll?rev=128665&r1=128664&r2=128665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/vmul.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/vmul.ll Thu Mar 31 14:38:48 2011
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
define <8 x i8> @vmuli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vmuli8:
@@ -466,3 +466,29 @@
declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) nounwind readonly
declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
+
+; Take advantage of the Cortex-A8 multiplier accumulator forward.
+
+%struct.uint8x8_t = type { <8 x i8> }
+
+define void @distribue2(%struct.uint8x8_t* nocapture %dst, i8* %src, i32 %mul) nounwind {
+entry:
+; CHECK: distribue2
+; CHECK-NOT: vadd.i8
+; CHECK: vmul.i8
+; CHECK: vmla.i8
+ %0 = trunc i32 %mul to i8
+ %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+ %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+ %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %src, i32 1)
+ %4 = bitcast <16 x i8> %3 to <2 x double>
+ %5 = extractelement <2 x double> %4, i32 1
+ %6 = bitcast double %5 to <8 x i8>
+ %7 = extractelement <2 x double> %4, i32 0
+ %8 = bitcast double %7 to <8 x i8>
+ %9 = add <8 x i8> %6, %8
+ %10 = mul <8 x i8> %9, %2
+ %11 = getelementptr inbounds %struct.uint8x8_t* %dst, i32 0, i32 0
+ store <8 x i8> %10, <8 x i8>* %11, align 8
+ ret void
+}
More information about the llvm-commits
mailing list