<div dir="ltr">Looks much nicer now, thanks!<div><br></div><div>-eric</div></div><br><div class="gmail_quote"><div dir="ltr">On Thu, Jun 2, 2016 at 11:10 AM Matthias Braun via llvm-commits <<a href="mailto:llvm-commits@lists.llvm.org">llvm-commits@lists.llvm.org</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Author: matze<br>
Date: Thu Jun 2 13:03:53 2016
New Revision: 271555

URL: http://llvm.org/viewvc/llvm-project?rev=271555&view=rev
Log:
AArch64: Do not test for CPUs, use SubtargetFeatures

Testing for specific CPUs has a number of problems; it is better to use
subtarget features:
- When a tweak is added for a specific CPU, it is often desirable for
  the next version of that CPU as well, yet we often forget to add it.
- It is hard to keep track of checks scattered around the target code;
  declaring all target specifics together with the CPU in the tablegen
  file is a clearer representation.
- Subtarget features can be tweaked from the command line, as the
  example below shows.
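For instance, a hypothetical invocation such as the following (the
-mattr names are the feature strings this patch adds; foo.ll is a
made-up input file) overrides the CPU defaults, disabling the
narrow-load merging that cortex-a57 otherwise enables and turning on
alias analysis during codegen:

  llc -mtriple=aarch64-- -mcpu=cortex-a57 -mattr=-merge-narrow-ld,+use-aa foo.ll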

To discourage people from using CPU checks in the future, I removed the
isCortexXX(), isCyclone(), ... functions. I added a getProcFamily()
function for exceptional circumstances, but made it clear in the comment
that its usage is discouraged; the sketch below contrasts the two styles.
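
(The fragments below are abridged from this patch; they come from
different functions, hence the mixed Subtarget references.)

  // Before: behavior keyed off specific CPU models.
  if (Subtarget->isCortexA57() || Subtarget->isKryo())
    PredictableSelectIsExpensive = true;

  // After: the same decision driven by a named subtarget feature.
  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();

  // Exceptional circumstances only: a direct processor-family check via
  // the new getProcFamily(), whose use is discouraged.
  return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 ||
          MI->getOperand(3).getImm() == 0);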

Reformat the feature list in AArch64.td to one feature per line, in
alphabetical order, to simplify merging and sorting for out-of-tree
tweaks.

No functional change intended.

Differential Revision: http://reviews.llvm.org/D20762

Modified:
    llvm/trunk/lib/Target/AArch64/AArch64.td
    llvm/trunk/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
    llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
    llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
    llvm/trunk/lib/Target/AArch64/AArch64Subtarget.cpp
    llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
    llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
    llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Modified: llvm/trunk/lib/Target/AArch64/AArch64.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64.td?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64.td (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64.td Thu Jun 2 13:03:53 2016
@@ -58,6 +58,50 @@ def FeatureReserveX18 : SubtargetFeature
"Reserve X18, making it unavailable "
"as a GPR">;

+def FeatureMergeNarrowLd : SubtargetFeature<"merge-narrow-ld",
+ "MergeNarrowLoads", "true",
+ "Merge narrow load instructions">;
+
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
+ "Use alias analysis during codegen">;
+
+def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps",
+ "true",
+ "balance mix of odd and even D-registers for fp multiply(-accumulate) ops">;
+
+def FeaturePredictableSelectIsExpensive : SubtargetFeature<
+ "predictable-select-expensive", "PredictableSelectIsExpensive", "true",
+ "Prefer likely predicted branches over selects">;
+
+def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
+ "CustomAsCheapAsMove", "true",
+ "Use custom code for TargetInstrInfo::isAsCheapAsAMove()">;
+
+def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
+ "UsePostRAScheduler", "true", "Schedule again after register allocation">;
+
+def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
+ "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;
+
+def FeatureAvoidQuadLdStPairs : SubtargetFeature<"no-quad-ldst-pairs",
+ "AvoidQuadLdStPairs", "true",
+ "Do not form quad load/store pair operations">;
+
+def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
+ "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
+ "true", "Use alternative pattern for sextload convert to f32">;
+
+def FeatureMacroOpFusion : SubtargetFeature<
+ "macroop-fusion", "HasMacroOpFusion", "true",
+ "CPU supports macro op fusion">;
+
+def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
+ "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
+ "Disable latency scheduling heuristic">;
+
+def FeatureUseRSqrt : SubtargetFeature<
+ "use-reverse-square-root", "UseRSqrt", "true", "Use reverse square root">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
@@ -94,57 +138,87 @@ include "AArch64SchedM1.td"
include "AArch64SchedKryo.td"

def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
- "Cortex-A35 ARM processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto,
+ "Cortex-A35 ARM processors", [
FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon
+ ]>;

def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
- "Cortex-A53 ARM processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto,
+ "Cortex-A53 ARM processors", [
+ FeatureBalanceFPOps,
FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeatureUseAA
+ ]>;

def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
- "Cortex-A57 ARM processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto,
+ "Cortex-A57 ARM processors", [
+ FeatureBalanceFPOps,
FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureMergeNarrowLd,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive
+ ]>;

def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
- "Cyclone",
- [FeatureFPARMv8,
- FeatureNEON,
+ "Cyclone", [
+ FeatureAlternateSExtLoadCVTF32Pattern,
FeatureCrypto,
+ FeatureDisableLatencySchedHeuristic,
+ FeatureFPARMv8,
+ FeatureMacroOpFusion,
+ FeatureNEON,
FeaturePerfMon,
- FeatureZCRegMove, FeatureZCZeroing]>;
+ FeatureSlowMisaligned128Store,
+ FeatureZCRegMove,
+ FeatureZCZeroing
+ ]>;

def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
- "Samsung Exynos-M1 processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto,
+ "Samsung Exynos-M1 processors", [
+ FeatureAvoidQuadLdStPairs,
FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeatureUseRSqrt
+ ]>;

def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
- "Qualcomm Kryo processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto,
+ "Qualcomm Kryo processors", [
FeatureCRC,
- FeaturePerfMon]>;
-
-def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
- FeatureNEON,
- FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureMergeNarrowLd,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive
+ ]>;
+
+def : ProcessorModel<"generic", NoSchedModel, [
+ FeatureCRC,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler
+ ]>;

// FIXME: Cortex-A35 is currently modelled as a Cortex-A53
def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;

Modified: llvm/trunk/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp Thu Jun 2 13:03:53 2016
@@ -314,9 +314,7 @@ bool AArch64A57FPLoadBalancing::runOnMac
if (skipFunction(*F.getFunction()))
return false;

- // Don't do anything if this isn't an A53 or A57.
- if (!(F.getSubtarget<AArch64Subtarget>().isCortexA53() ||
- F.getSubtarget<AArch64Subtarget>().isCortexA57()))
+ if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps())
return false;

bool Changed = false;

Modified: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp Thu Jun 2 13:03:53 2016
@@ -634,9 +634,7 @@ AArch64TargetLowering::AArch64TargetLowe
}
}

- // Prefer likely predicted branches to selects on out-of-order cores.
- if (Subtarget->isCortexA57() || Subtarget->isKryo())
- PredictableSelectIsExpensive = true;
+ PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}

void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
@@ -814,12 +812,9 @@ bool AArch64TargetLowering::allowsMisali
if (Subtarget->requiresStrictAlign())
return false;

- // FIXME: This is mostly true for Cyclone, but not necessarily others.
if (Fast) {
- // FIXME: Define an attribute for slow unaligned accesses instead of
- // relying on the CPU type as a proxy.
- // On Cyclone, unaligned 128-bit stores are slow.
- *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
+ // Some CPUs are fine with unaligned stores except for 128-bit ones.
+ *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
// See comments in performSTORECombine() for more details about
// these conditions.

@@ -8792,9 +8787,7 @@ static SDValue split16BStores(SDNode *N,
// be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
// a call to that function here.

- // Cyclone has bad performance on unaligned 16B stores when crossing line and
- // page boundaries. We want to split such stores.
- if (!Subtarget->isCyclone())
+ if (!Subtarget->isMisaligned128StoreSlow())
return SDValue();

// Don't split at -Oz.

Modified: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp Thu Jun 2 13:03:53 2016
@@ -544,8 +544,7 @@ static bool canBeExpandedToORR(const Mac
// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
- if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53() &&
- !Subtarget.isExynosM1() && !Subtarget.isKryo())
+ if (!Subtarget.hasCustomCheapAsMoveHandling())
return MI->isAsCheapAsAMove();

unsigned Imm;
@@ -559,7 +558,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(
case AArch64::ADDXri:
case AArch64::SUBWri:
case AArch64::SUBXri:
- return (Subtarget.isExynosM1() ||
+ return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 ||
MI->getOperand(3).getImm() == 0);

// add/sub on register with shift
case AArch64::ADDWrs:
case AArch64::SUBWrs:
case AArch64::SUBXrs:
Imm = MI->getOperand(3).getImm();
- return (Subtarget.isExynosM1() &&
+ return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
AArch64_AM::getArithShiftValue(Imm) < 4);

// logical ops on immediate
@@ -609,7 +608,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(
case AArch64::ORRWrs:
case AArch64::ORRXrs:
Imm = MI->getOperand(3).getImm();
- return (Subtarget.isExynosM1() &&
+ return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
AArch64_AM::getShiftValue(Imm) < 4 &&
AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL);

@@ -1522,8 +1521,8 @@ bool AArch64InstrInfo::isCandidateToMerg
if (isLdStPairSuppressed(MI))
return false;

- // Do not pair quad ld/st for Exynos.
- if (Subtarget.isExynosM1()) {
+ // On some CPUs quad load/store pairs are slower than two single load/stores.
+ if (Subtarget.avoidQuadLdStPairs()) {
switch (MI->getOpcode()) {
default:
break;
@@ -1801,8 +1800,8 @@ bool AArch64InstrInfo::shouldClusterMemO

bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
MachineInstr *Second) const {
- if (Subtarget.isCyclone()) {
- // Cyclone can fuse CMN, CMP, TST followed by Bcc.
+ if (Subtarget.hasMacroOpFusion()) {
+ // Fuse CMN, CMP, TST followed by Bcc.
unsigned SecondOpcode = Second->getOpcode();
if (SecondOpcode == AArch64::Bcc) {
switch (First->getOpcode()) {
@@ -1817,7 +1816,7 @@ bool AArch64InstrInfo::shouldScheduleAdj
return true;
}
}
- // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ.
+ // Fuse ALU operations followed by CBZ/CBNZ.
if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
switch (First->getOpcode()) {

Modified: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td Thu Jun 2 13:03:53 2016
@@ -34,7 +34,8 @@ def HasSPE : Predicate<"Subtar

def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
-def IsCyclone : Predicate<"Subtarget->isCyclone()">;
+def UseAlternateSExtLoadCVTF32
+ : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;

//===----------------------------------------------------------------------===//
// AArch64-specific DAG Nodes.
@@ -4957,7 +4958,8 @@ class SExtLoadi8CVTf32Pat<dag addrmode,
0),
dsub)),
0),
- ssub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+ ssub)))>,
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;

def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
(LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
@@ -5010,7 +5012,8 @@ class SExtLoadi16CVTf64Pat<dag addrmode,
0),
dsub)),
0),
- dsub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+ dsub)))>,
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;

def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;

Modified: llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp Thu Jun 2 13:03:53 2016
@@ -160,10 +160,6 @@ struct AArch64LoadStoreOpt : public Mach
// Find and promote load instructions which read directly from store.
bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

- // Check if converting two narrow loads into a single wider load with
- // bitfield extracts could be enabled.
- bool enableNarrowLdMerge(MachineFunction &Fn);
-
bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);

bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -1912,15 +1908,6 @@ bool AArch64LoadStoreOpt::optimizeBlock(
return Modified;
}

-bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
- bool ProfitableArch = Subtarget->isCortexA57() || Subtarget->isKryo();
- // FIXME: The benefit from converting narrow loads into a wider load could be
- // microarchitectural as it assumes that a single load with two bitfield
- // extracts is cheaper than two narrow loads. Currently, this conversion is
- // enabled only in cortex-a57 on which performance benefits were verified.
- return ProfitableArch && !Subtarget->requiresStrictAlign();
-}
-
bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
if (skipFunction(*Fn.getFunction()))
return false;
@@ -1936,7 +1923,8 @@ bool AArch64LoadStoreOpt::runOnMachineFu
UsedRegs.resize(TRI->getNumRegs());

bool Modified = false;
- bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
+ bool enableNarrowLdOpt =
+ Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign();
for (auto &MBB : Fn)
Modified |= optimizeBlock(MBB, enableNarrowLdOpt);


Modified: llvm/trunk/lib/Target/AArch64/AArch64Subtarget.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64Subtarget.cpp?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64Subtarget.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64Subtarget.cpp Thu Jun 2 13:03:53 2016
@@ -44,9 +44,36 @@ AArch64Subtarget::initializeSubtargetDep
CPUString = "generic";

ParseSubtargetFeatures(CPUString, FS);
+ initializeProperties();
+
return *this;
}

+void AArch64Subtarget::initializeProperties() {
+ // Initialize CPU specific properties. We should add a tablegen feature for
+ // this in the future so we can specify it together with the subtarget
+ // features.
+ switch (ARMProcFamily) {
+ case Cyclone:
+ CacheLineSize = 64;
+ PrefetchDistance = 280;
+ MinPrefetchStride = 2048;
+ MaxPrefetchIterationsAhead = 3;
+ break;
+ case CortexA57:
+ MaxInterleaveFactor = 4;
+ break;
+ case Kryo:
+ MaxInterleaveFactor = 4;
+ VectorInsertExtractBaseCost = 2;
+ break;
+ case Others: break;
+ case CortexA35: break;
+ case CortexA53: break;
+ case ExynosM1: break;
+ }
+}
+
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const TargetMachine &TM, bool LittleEndian)
@@ -110,8 +137,7 @@ void AArch64Subtarget::overrideSchedPoli
// Enabling or Disabling the latency heuristic is a close call: It seems to
// help nearly no benchmark on out-of-order architectures, on the other hand
// it regresses register pressure on a few benchmarking.
- if (isCyclone())
- Policy.DisableLatencyHeuristic = true;
+ Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
@@ -133,8 +159,5 @@ bool AArch64Subtarget::supportsAddressTo

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
- if (!isCortexA57())
- return nullptr;
-
- return llvm::make_unique<A57ChainingConstraint>();
+ return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr;
}

Modified: llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h Thu Jun 2 13:03:53 2016
@@ -33,8 +33,8 @@ class StringRef;
class Triple;

class AArch64Subtarget : public AArch64GenSubtargetInfo {
-protected:
- enum ARMProcFamilyEnum {
+public:
+ enum ARMProcFamilyEnum : uint8_t {
Others,
CortexA35,
CortexA53,
@@ -44,6 +44,7 @@ protected:
Kryo
};

+protected:
/// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
ARMProcFamilyEnum ARMProcFamily = Others;

@@ -66,6 +67,24 @@ protected:

// StrictAlign - Disallow unaligned memory accesses.
bool StrictAlign = false;
+ bool MergeNarrowLoads = false;
+ bool UseAA = false;
+ bool PredictableSelectIsExpensive = false;
+ bool BalanceFPOps = false;
+ bool CustomAsCheapAsMove = false;
+ bool UsePostRAScheduler = false;
+ bool Misaligned128StoreIsSlow = false;
+ bool AvoidQuadLdStPairs = false;
+ bool UseAlternateSExtLoadCVTF32Pattern = false;
+ bool HasMacroOpFusion = false;
+ bool DisableLatencySchedHeuristic = false;
+ bool UseRSqrt = false;
+ uint8_t MaxInterleaveFactor = 2;
+ uint8_t VectorInsertExtractBaseCost = 3;
+ uint16_t CacheLineSize = 0;
+ uint16_t PrefetchDistance = 0;
+ uint16_t MinPrefetchStride = 1;
+ unsigned MaxPrefetchIterationsAhead = UINT_MAX;

// ReserveX18 - X18 is not available as a general purpose register.
bool ReserveX18;
@@ -93,6 +112,9 @@ private:
/// subtarget initialization.
AArch64Subtarget &initializeSubtargetDependencies(StringRef FS);

+ /// Initialize properties based on the selected processor family.
+ void initializeProperties();
+
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
@@ -123,7 +145,15 @@ public:
const Triple &getTargetTriple() const { return TargetTriple; }
bool enableMachineScheduler() const override { return true; }
bool enablePostRAScheduler() const override {
- return isGeneric() || isCortexA53() || isCortexA57() || isKryo();
+ return UsePostRAScheduler;
+ }
+
+ /// Returns ARM processor family.
+ /// Avoid this function! CPU specifics should be kept local to this class
+ /// and preferably modeled with SubtargetFeatures or properties in
+ /// initializeProperties().
+ ARMProcFamilyEnum getProcFamily() const {
+ return ARMProcFamily;
}

bool hasV8_1aOps() const { return HasV8_1aOps; }
@@ -140,6 +170,30 @@ public:
bool hasNEON() const { return HasNEON; }
bool hasCrypto() const { return HasCrypto; }
bool hasCRC() const { return HasCRC; }
+ bool mergeNarrowLoads() const { return MergeNarrowLoads; }
+ bool balanceFPOps() const { return BalanceFPOps; }
+ bool predictableSelectIsExpensive() const {
+ return PredictableSelectIsExpensive;
+ }
+ bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
+ bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
+ bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; }
+ bool useAlternateSExtLoadCVTF32Pattern() const {
+ return UseAlternateSExtLoadCVTF32Pattern;
+ }
+ bool hasMacroOpFusion() const { return HasMacroOpFusion; }
+ bool useRSqrt() const { return UseRSqrt; }
+ unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+ unsigned getVectorInsertExtractBaseCost() const {
+ return VectorInsertExtractBaseCost;
+ }
+ unsigned getCacheLineSize() const { return CacheLineSize; }
+ unsigned getPrefetchDistance() const { return PrefetchDistance; }
+ unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
+ unsigned getMaxPrefetchIterationsAhead() const {
+ return MaxPrefetchIterationsAhead;
+ }
+
/// CPU has TBI (top byte of addresses is ignored during HW address
/// translation) and OS enables it.
bool supportsAddressTopByteIgnored() const;
@@ -160,14 +214,7 @@ public:
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }

- bool isGeneric() const { return CPUString == "generic"; }
- bool isCyclone() const { return CPUString == "cyclone"; }
- bool isCortexA57() const { return CPUString == "cortex-a57"; }
- bool isCortexA53() const { return CPUString == "cortex-a53"; }
- bool isExynosM1() const { return CPUString == "exynos-m1"; }
- bool isKryo() const { return CPUString == "kryo"; }
-
- bool useAA() const override { return isCortexA53(); }
+ bool useAA() const override { return UseAA; }

/// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
/// that still makes it profitable to inline the call.

Modified: llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp Thu Jun 2 13:03:53 2016
@@ -147,8 +147,7 @@ static void initReciprocals(AArch64Targe
// (52 mantissa bits) are 2 and 3, respectively.
unsigned ExtraStepsF = 2,
ExtraStepsD = ExtraStepsF + 1;
- // FIXME: Enable x^-1/2 only for Exynos M1 at the moment.
- bool UseRsqrt = ST.isExynosM1();
+ bool UseRsqrt = ST.useRSqrt();

TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, ExtraStepsF);
TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, ExtraStepsD);

Modified: llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp Thu Jun 2 13:03:53 2016
@@ -368,9 +368,7 @@ int AArch64TTIImpl::getVectorInstrCost(u
}

// All other insert/extracts cost this much.
- if (ST->isKryo())
- return 2;
- return 3;
+ return ST->getVectorInsertExtractBaseCost();
}

int AArch64TTIImpl::getArithmeticInstrCost(
@@ -529,9 +527,7 @@ int AArch64TTIImpl::getCostOfKeepingLive
}

unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
- if (ST->isCortexA57() || ST->isKryo())
- return 4;
- return 2;
+ return ST->getMaxInterleaveFactor();
}

void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
@@ -630,28 +626,17 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(
}

unsigned AArch64TTIImpl::getCacheLineSize() {
- if (ST->isCyclone())
- return 64;
- return BaseT::getCacheLineSize();
+ return ST->getCacheLineSize();
}

unsigned AArch64TTIImpl::getPrefetchDistance() {
- if (ST->isCyclone())
- return 280;
- return BaseT::getPrefetchDistance();
+ return ST->getPrefetchDistance();
}

unsigned AArch64TTIImpl::getMinPrefetchStride() {
- if (ST->isCyclone())
- // The HW prefetcher handles accesses with strides up to 2KB.
- return 2048;
- return BaseT::getMinPrefetchStride();
+ return ST->getMinPrefetchStride();
}

unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
- if (ST->isCyclone())
- // Be conservative for now and don't prefetch ahead too much since the loop
- // may terminate early.
- return 3;
- return BaseT::getMaxPrefetchIterationsAhead();
+ return ST->getMaxPrefetchIterationsAhead();
}


_______________________________________________
llvm-commits mailing list
llvm-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits