<div dir="ltr">Looks much nicer now, thanks!<div><br></div><div>-eric</div></div><br><div class="gmail_quote"><div dir="ltr">On Thu, Jun 2, 2016 at 11:10 AM Matthias Braun via llvm-commits <<a href="mailto:llvm-commits@lists.llvm.org">llvm-commits@lists.llvm.org</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Author: matze<br>
Date: Thu Jun 2 13:03:53 2016
New Revision: 271555

URL: http://llvm.org/viewvc/llvm-project?rev=271555&view=rev
Log:
AArch64: Do not test for CPUs, use SubtargetFeatures

Testing for specific CPUs has a number of problems; it is better to use
subtarget features:
- When a tweak is added for a specific CPU, it is often desirable for
  the next version of that CPU as well, yet we often forget to add it.
- It is hard to keep track of checks scattered around the target code;
  declaring all target specifics together with the CPU in the tablegen
  file is a clearer representation.
- Subtarget features can be tweaked from the command line, as the
  example below shows.
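For instance, a hypothetical invocation such as the following (the
-mattr names are the feature strings this patch adds; foo.ll is a
made-up input file) overrides the CPU defaults, disabling the
narrow-load merging that cortex-a57 otherwise enables and turning on
alias analysis during codegen:

  llc -mtriple=aarch64-- -mcpu=cortex-a57 -mattr=-merge-narrow-ld,+use-aa foo.ll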

To discourage people from using CPU checks in the future, I removed the
isCortexXX(), isCyclone(), ... functions. I added a getProcFamily()
function for exceptional circumstances, but made it clear in the comment
that its usage is discouraged; the sketch below contrasts the two styles.
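
(The fragments below are abridged from this patch; they come from
different functions, hence the mixed Subtarget references.)

  // Before: behavior keyed off specific CPU models.
  if (Subtarget->isCortexA57() || Subtarget->isKryo())
    PredictableSelectIsExpensive = true;

  // After: the same decision driven by a named subtarget feature.
  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();

  // Exceptional circumstances only: a direct processor-family check via
  // the new getProcFamily(), whose use is discouraged.
  return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 ||
          MI->getOperand(3).getImm() == 0);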

Reformat the feature list in AArch64.td to one feature per line, in
alphabetical order, to simplify merging and sorting for out-of-tree
tweaks.

No functional change intended.

Differential Revision: http://reviews.llvm.org/D20762

Modified:
    llvm/trunk/lib/Target/AArch64/AArch64.td
    llvm/trunk/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
    llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
    llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
    llvm/trunk/lib/Target/AArch64/AArch64Subtarget.cpp
    llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
    llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
    llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Modified: llvm/trunk/lib/Target/AArch64/AArch64.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64.td?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64.td (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64.td Thu Jun 2 13:03:53 2016
@@ -58,6 +58,50 @@ def FeatureReserveX18 : SubtargetFeature
"Reserve X18, making it unavailable "
"as a GPR">;

+def FeatureMergeNarrowLd : SubtargetFeature<"merge-narrow-ld",
+ "MergeNarrowLoads", "true",
+ "Merge narrow load instructions">;
+
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
+ "Use alias analysis during codegen">;
+
+def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps",
+ "true",
+ "balance mix of odd and even D-registers for fp multiply(-accumulate) ops">;
+
+def FeaturePredictableSelectIsExpensive : SubtargetFeature<
+ "predictable-select-expensive", "PredictableSelectIsExpensive", "true",
+ "Prefer likely predicted branches over selects">;
+
+def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
+ "CustomAsCheapAsMove", "true",
+ "Use custom code for TargetInstrInfo::isAsCheapAsAMove()">;
+
+def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
+ "UsePostRAScheduler", "true", "Schedule again after register allocation">;
+
+def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
+ "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;
+
+def FeatureAvoidQuadLdStPairs : SubtargetFeature<"no-quad-ldst-pairs",
+ "AvoidQuadLdStPairs", "true",
+ "Do not form quad load/store pair operations">;
+
+def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
+ "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
+ "true", "Use alternative pattern for sextload convert to f32">;
+
+def FeatureMacroOpFusion : SubtargetFeature<
+ "macroop-fusion", "HasMacroOpFusion", "true",
+ "CPU supports macro op fusion">;
+
+def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
+ "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
+ "Disable latency scheduling heuristic">;
+
+def FeatureUseRSqrt : SubtargetFeature<
+ "use-reverse-square-root", "UseRSqrt", "true", "Use reverse square root">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
@@ -94,57 +138,87 @@ include "AArch64SchedM1.td"
include "AArch64SchedKryo.td"

def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
- "Cortex-A35 ARM processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto,
+ "Cortex-A35 ARM processors", [
FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon
+ ]>;

def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
- "Cortex-A53 ARM processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto,
+ "Cortex-A53 ARM processors", [
+ FeatureBalanceFPOps,
FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeatureUseAA
+ ]>;

def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
- "Cortex-A57 ARM processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto,
+ "Cortex-A57 ARM processors", [
+ FeatureBalanceFPOps,
FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureMergeNarrowLd,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive
+ ]>;

def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
- "Cyclone",
- [FeatureFPARMv8,
- FeatureNEON,
+ "Cyclone", [
+ FeatureAlternateSExtLoadCVTF32Pattern,
FeatureCrypto,
+ FeatureDisableLatencySchedHeuristic,
+ FeatureFPARMv8,
+ FeatureMacroOpFusion,
+ FeatureNEON,
FeaturePerfMon,
- FeatureZCRegMove, FeatureZCZeroing]>;
+ FeatureSlowMisaligned128Store,
+ FeatureZCRegMove,
+ FeatureZCZeroing
+ ]>;

def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
- "Samsung Exynos-M1 processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto,
+ "Samsung Exynos-M1 processors", [
+ FeatureAvoidQuadLdStPairs,
FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeatureUseRSqrt
+ ]>;

def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
- "Qualcomm Kryo processors",
- [FeatureFPARMv8,
- FeatureNEON,
- FeatureCrypto,
+ "Qualcomm Kryo processors", [
FeatureCRC,
- FeaturePerfMon]>;
-
-def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
- FeatureNEON,
- FeatureCRC,
- FeaturePerfMon]>;
+ FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
+ FeatureFPARMv8,
+ FeatureMergeNarrowLd,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive
+ ]>;
+
+def : ProcessorModel<"generic", NoSchedModel, [
+ FeatureCRC,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeaturePostRAScheduler
+ ]>;

// FIXME: Cortex-A35 is currently modelled as a Cortex-A53
def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;

Modified: llvm/trunk/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp Thu Jun 2 13:03:53 2016
@@ -314,9 +314,7 @@ bool AArch64A57FPLoadBalancing::runOnMac
if (skipFunction(*F.getFunction()))
return false;

- // Don't do anything if this isn't an A53 or A57.
- if (!(F.getSubtarget<AArch64Subtarget>().isCortexA53() ||
- F.getSubtarget<AArch64Subtarget>().isCortexA57()))
+ if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps())
return false;

bool Changed = false;

Modified: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp Thu Jun 2 13:03:53 2016
@@ -634,9 +634,7 @@ AArch64TargetLowering::AArch64TargetLowe
}
}

- // Prefer likely predicted branches to selects on out-of-order cores.
- if (Subtarget->isCortexA57() || Subtarget->isKryo())
- PredictableSelectIsExpensive = true;
+ PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}

void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
@@ -814,12 +812,9 @@ bool AArch64TargetLowering::allowsMisali
if (Subtarget->requiresStrictAlign())
return false;

- // FIXME: This is mostly true for Cyclone, but not necessarily others.
if (Fast) {
- // FIXME: Define an attribute for slow unaligned accesses instead of
- // relying on the CPU type as a proxy.
- // On Cyclone, unaligned 128-bit stores are slow.
- *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
+ // Some CPUs are fine with unaligned stores except for 128-bit ones.
+ *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
// See comments in performSTORECombine() for more details about
// these conditions.

@@ -8792,9 +8787,7 @@ static SDValue split16BStores(SDNode *N,
// be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
// a call to that function here.

- // Cyclone has bad performance on unaligned 16B stores when crossing line and
- // page boundaries. We want to split such stores.
- if (!Subtarget->isCyclone())
+ if (!Subtarget->isMisaligned128StoreSlow())
return SDValue();

// Don't split at -Oz.

Modified: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp Thu Jun 2 13:03:53 2016
@@ -544,8 +544,7 @@ static bool canBeExpandedToORR(const Mac
// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
- if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53() &&
- !Subtarget.isExynosM1() && !Subtarget.isKryo())
+ if (!Subtarget.hasCustomCheapAsMoveHandling())
return MI->isAsCheapAsAMove();

unsigned Imm;
@@ -559,7 +558,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(
case AArch64::ADDXri:
case AArch64::SUBWri:
case AArch64::SUBXri:
- return (Subtarget.isExynosM1() ||
+ return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 ||
MI->getOperand(3).getImm() == 0);

// add/sub on register with shift
case AArch64::ADDWrs:
case AArch64::SUBWrs:
case AArch64::SUBXrs:
Imm = MI->getOperand(3).getImm();
- return (Subtarget.isExynosM1() &&
+ return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
AArch64_AM::getArithShiftValue(Imm) < 4);

// logical ops on immediate
@@ -609,7 +608,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(
case AArch64::ORRWrs:
case AArch64::ORRXrs:
Imm = MI->getOperand(3).getImm();
- return (Subtarget.isExynosM1() &&
+ return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
AArch64_AM::getShiftValue(Imm) < 4 &&
AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL);

@@ -1522,8 +1521,8 @@ bool AArch64InstrInfo::isCandidateToMerg
if (isLdStPairSuppressed(MI))
return false;

- // Do not pair quad ld/st for Exynos.
- if (Subtarget.isExynosM1()) {
+ // On some CPUs quad load/store pairs are slower than two single load/stores.
+ if (Subtarget.avoidQuadLdStPairs()) {
switch (MI->getOpcode()) {
default:
break;
@@ -1801,8 +1800,8 @@ bool AArch64InstrInfo::shouldClusterMemO

bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
MachineInstr *Second) const {
- if (Subtarget.isCyclone()) {
- // Cyclone can fuse CMN, CMP, TST followed by Bcc.
+ if (Subtarget.hasMacroOpFusion()) {
+ // Fuse CMN, CMP, TST followed by Bcc.
unsigned SecondOpcode = Second->getOpcode();
if (SecondOpcode == AArch64::Bcc) {
switch (First->getOpcode()) {
@@ -1817,7 +1816,7 @@ bool AArch64InstrInfo::shouldScheduleAdj
return true;
}
}
- // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ.
+ // Fuse ALU operations followed by CBZ/CBNZ.
if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
switch (First->getOpcode()) {

Modified: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td Thu Jun 2 13:03:53 2016
@@ -34,7 +34,8 @@ def HasSPE : Predicate<"Subtar

def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
-def IsCyclone : Predicate<"Subtarget->isCyclone()">;
+def UseAlternateSExtLoadCVTF32
+ : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;

//===----------------------------------------------------------------------===//
// AArch64-specific DAG Nodes.
@@ -4957,7 +4958,8 @@ class SExtLoadi8CVTf32Pat<dag addrmode,
0),
dsub)),
0),
- ssub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+ ssub)))>,
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;

def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
(LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
@@ -5010,7 +5012,8 @@ class SExtLoadi16CVTf64Pat<dag addrmode,
0),
dsub)),
0),
- dsub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+ dsub)))>,
+ Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;

def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;

Modified: llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp Thu Jun 2 13:03:53 2016
@@ -160,10 +160,6 @@ struct AArch64LoadStoreOpt : public Mach
// Find and promote load instructions which read directly from store.
bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

- // Check if converting two narrow loads into a single wider load with
- // bitfield extracts could be enabled.
- bool enableNarrowLdMerge(MachineFunction &Fn);
-
bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);

bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -1912,15 +1908,6 @@ bool AArch64LoadStoreOpt::optimizeBlock(
return Modified;
}

-bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
- bool ProfitableArch = Subtarget->isCortexA57() || Subtarget->isKryo();
- // FIXME: The benefit from converting narrow loads into a wider load could be
- // microarchitectural as it assumes that a single load with two bitfield
- // extracts is cheaper than two narrow loads. Currently, this conversion is
- // enabled only in cortex-a57 on which performance benefits were verified.
- return ProfitableArch && !Subtarget->requiresStrictAlign();
-}
-
bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
if (skipFunction(*Fn.getFunction()))
return false;
@@ -1936,7 +1923,8 @@ bool AArch64LoadStoreOpt::runOnMachineFu
UsedRegs.resize(TRI->getNumRegs());

bool Modified = false;
- bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
+ bool enableNarrowLdOpt =
+ Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign();
for (auto &MBB : Fn)
Modified |= optimizeBlock(MBB, enableNarrowLdOpt);


Modified: llvm/trunk/lib/Target/AArch64/AArch64Subtarget.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64Subtarget.cpp?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64Subtarget.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64Subtarget.cpp Thu Jun 2 13:03:53 2016
@@ -44,9 +44,36 @@ AArch64Subtarget::initializeSubtargetDep
CPUString = "generic";

ParseSubtargetFeatures(CPUString, FS);
+ initializeProperties();
+
return *this;
}

+void AArch64Subtarget::initializeProperties() {
+ // Initialize CPU specific properties. We should add a tablegen feature for
+ // this in the future so we can specify it together with the subtarget
+ // features.
+ switch (ARMProcFamily) {
+ case Cyclone:
+ CacheLineSize = 64;
+ PrefetchDistance = 280;
+ MinPrefetchStride = 2048;
+ MaxPrefetchIterationsAhead = 3;
+ break;
+ case CortexA57:
+ MaxInterleaveFactor = 4;
+ break;
+ case Kryo:
+ MaxInterleaveFactor = 4;
+ VectorInsertExtractBaseCost = 2;
+ break;
+ case Others: break;
+ case CortexA35: break;
+ case CortexA53: break;
+ case ExynosM1: break;
+ }
+}
+
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const TargetMachine &TM, bool LittleEndian)
@@ -110,8 +137,7 @@ void AArch64Subtarget::overrideSchedPoli
// Enabling or Disabling the latency heuristic is a close call: It seems to
// help nearly no benchmark on out-of-order architectures, on the other hand
// it regresses register pressure on a few benchmarking.
- if (isCyclone())
- Policy.DisableLatencyHeuristic = true;
+ Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
@@ -133,8 +159,5 @@ bool AArch64Subtarget::supportsAddressTo

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
- if (!isCortexA57())
- return nullptr;
-
- return llvm::make_unique<A57ChainingConstraint>();
+ return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr;
}

Modified: llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h Thu Jun 2 13:03:53 2016
@@ -33,8 +33,8 @@ class StringRef;
class Triple;

class AArch64Subtarget : public AArch64GenSubtargetInfo {
-protected:
- enum ARMProcFamilyEnum {
+public:
+ enum ARMProcFamilyEnum : uint8_t {
Others,
CortexA35,
CortexA53,
@@ -44,6 +44,7 @@ protected:
Kryo
};

+protected:
/// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
ARMProcFamilyEnum ARMProcFamily = Others;

@@ -66,6 +67,24 @@ protected:

// StrictAlign - Disallow unaligned memory accesses.
bool StrictAlign = false;
+ bool MergeNarrowLoads = false;
+ bool UseAA = false;
+ bool PredictableSelectIsExpensive = false;
+ bool BalanceFPOps = false;
+ bool CustomAsCheapAsMove = false;
+ bool UsePostRAScheduler = false;
+ bool Misaligned128StoreIsSlow = false;
+ bool AvoidQuadLdStPairs = false;
+ bool UseAlternateSExtLoadCVTF32Pattern = false;
+ bool HasMacroOpFusion = false;
+ bool DisableLatencySchedHeuristic = false;
+ bool UseRSqrt = false;
+ uint8_t MaxInterleaveFactor = 2;
+ uint8_t VectorInsertExtractBaseCost = 3;
+ uint16_t CacheLineSize = 0;
+ uint16_t PrefetchDistance = 0;
+ uint16_t MinPrefetchStride = 1;
+ unsigned MaxPrefetchIterationsAhead = UINT_MAX;

// ReserveX18 - X18 is not available as a general purpose register.
bool ReserveX18;
@@ -93,6 +112,9 @@ private:
/// subtarget initialization.
AArch64Subtarget &initializeSubtargetDependencies(StringRef FS);

+ /// Initialize properties based on the selected processor family.
+ void initializeProperties();
+
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
@@ -123,7 +145,15 @@ public:
const Triple &getTargetTriple() const { return TargetTriple; }
bool enableMachineScheduler() const override { return true; }
bool enablePostRAScheduler() const override {
- return isGeneric() || isCortexA53() || isCortexA57() || isKryo();
+ return UsePostRAScheduler;
+ }
+
+ /// Returns ARM processor family.
+ /// Avoid this function! CPU specifics should be kept local to this class
+ /// and preferably modeled with SubtargetFeatures or properties in
+ /// initializeProperties().
+ ARMProcFamilyEnum getProcFamily() const {
+ return ARMProcFamily;
}

bool hasV8_1aOps() const { return HasV8_1aOps; }
@@ -140,6 +170,30 @@ public:
bool hasNEON() const { return HasNEON; }
bool hasCrypto() const { return HasCrypto; }
bool hasCRC() const { return HasCRC; }
+ bool mergeNarrowLoads() const { return MergeNarrowLoads; }
+ bool balanceFPOps() const { return BalanceFPOps; }
+ bool predictableSelectIsExpensive() const {
+ return PredictableSelectIsExpensive;
+ }
+ bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
+ bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
+ bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; }
+ bool useAlternateSExtLoadCVTF32Pattern() const {
+ return UseAlternateSExtLoadCVTF32Pattern;
+ }
+ bool hasMacroOpFusion() const { return HasMacroOpFusion; }
+ bool useRSqrt() const { return UseRSqrt; }
+ unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+ unsigned getVectorInsertExtractBaseCost() const {
+ return VectorInsertExtractBaseCost;
+ }
+ unsigned getCacheLineSize() const { return CacheLineSize; }
+ unsigned getPrefetchDistance() const { return PrefetchDistance; }
+ unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
+ unsigned getMaxPrefetchIterationsAhead() const {
+ return MaxPrefetchIterationsAhead;
+ }
+
/// CPU has TBI (top byte of addresses is ignored during HW address
/// translation) and OS enables it.
bool supportsAddressTopByteIgnored() const;
@@ -160,14 +214,7 @@ public:
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }

- bool isGeneric() const { return CPUString == "generic"; }
- bool isCyclone() const { return CPUString == "cyclone"; }
- bool isCortexA57() const { return CPUString == "cortex-a57"; }
- bool isCortexA53() const { return CPUString == "cortex-a53"; }
- bool isExynosM1() const { return CPUString == "exynos-m1"; }
- bool isKryo() const { return CPUString == "kryo"; }
-
- bool useAA() const override { return isCortexA53(); }
+ bool useAA() const override { return UseAA; }

/// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
/// that still makes it profitable to inline the call.

Modified: llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp Thu Jun 2 13:03:53 2016
@@ -147,8 +147,7 @@ static void initReciprocals(AArch64Targe
// (52 mantissa bits) are 2 and 3, respectively.
unsigned ExtraStepsF = 2,
ExtraStepsD = ExtraStepsF + 1;
- // FIXME: Enable x^-1/2 only for Exynos M1 at the moment.
- bool UseRsqrt = ST.isExynosM1();
+ bool UseRsqrt = ST.useRSqrt();

TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, ExtraStepsF);
TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, ExtraStepsD);

Modified: llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp?rev=271555&r1=271554&r2=271555&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.cpp Thu Jun 2 13:03:53 2016
@@ -368,9 +368,7 @@ int AArch64TTIImpl::getVectorInstrCost(u
}

// All other insert/extracts cost this much.
- if (ST->isKryo())
- return 2;
- return 3;
+ return ST->getVectorInsertExtractBaseCost();
}

int AArch64TTIImpl::getArithmeticInstrCost(
@@ -529,9 +527,7 @@ int AArch64TTIImpl::getCostOfKeepingLive
}

unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
- if (ST->isCortexA57() || ST->isKryo())
- return 4;
- return 2;
+ return ST->getMaxInterleaveFactor();
}

void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
@@ -630,28 +626,17 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(
}

unsigned AArch64TTIImpl::getCacheLineSize() {
- if (ST->isCyclone())
- return 64;
- return BaseT::getCacheLineSize();
+ return ST->getCacheLineSize();
}

unsigned AArch64TTIImpl::getPrefetchDistance() {
- if (ST->isCyclone())
- return 280;
- return BaseT::getPrefetchDistance();
+ return ST->getPrefetchDistance();
}

unsigned AArch64TTIImpl::getMinPrefetchStride() {
- if (ST->isCyclone())
- // The HW prefetcher handles accesses with strides up to 2KB.
- return 2048;
- return BaseT::getMinPrefetchStride();
+ return ST->getMinPrefetchStride();
}

unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
- if (ST->isCyclone())
- // Be conservative for now and don't prefetch ahead too much since the loop
- // may terminate early.
- return 3;
- return BaseT::getMaxPrefetchIterationsAhead();
+ return ST->getMaxPrefetchIterationsAhead();
}


_______________________________________________
llvm-commits mailing list
llvm-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits