[llvm] r306547 - [ARM] Improve if-conversion for M-class CPUs without branch predictors

John Brawn via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 28 07:11:16 PDT 2017


Author: john.brawn
Date: Wed Jun 28 07:11:15 2017
New Revision: 306547

URL: http://llvm.org/viewvc/llvm-project?rev=306547&view=rev
Log:
[ARM] Improve if-conversion for M-class CPUs without branch predictors

The current heuristic in isProfitableToIfCvt assumes we have a branch predictor,
and so gives the wrong answer in some cases when we don't. This patch adds a
subtarget feature to indicate that a subtarget has no branch predictor, and
changes the heuristic in isProfitableToIfCvt when it's present. This gives a
slight overall improvement in a set of embedded benchmarks on Cortex-M4 and
Cortex-M33.

Differential Revision: https://reviews.llvm.org/D34398

Added:
    llvm/trunk/lib/Target/ARM/ARMScheduleM3.td
    llvm/trunk/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll
Modified:
    llvm/trunk/lib/Target/ARM/ARM.td
    llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
    llvm/trunk/lib/Target/ARM/ARMSchedule.td
    llvm/trunk/lib/Target/ARM/ARMSubtarget.h

Modified: llvm/trunk/lib/Target/ARM/ARM.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARM.td?rev=306547&r1=306546&r2=306547&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARM.td (original)
+++ llvm/trunk/lib/Target/ARM/ARM.td Wed Jun 28 07:11:15 2017
@@ -222,6 +222,13 @@ def FeatureAvoidMOVsShOp : SubtargetFeat
 def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true",
                                      "Has return address stack">;
 
+// Some processors have no branch predictor, which changes the expected cost of
+// taking a branch which affects the choice of whether to use predicated
+// instructions.
+def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor",
+                                                   "HasBranchPredictor", "false",
+                                                   "Has no branch predictor">;
+
 /// DSP extension.
 def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true",
                               "Supports DSP instructions in ARM and/or Thumb2">;
@@ -756,13 +763,19 @@ def : ProcessorModel<"cortex-r8",   Cort
                                                          FeatureHasSlowFPVMLx,
                                                          FeatureAvoidPartialCPSR]>;
 
-def : ProcNoItin<"cortex-m3",                           [ARMv7m, ProcM3]>;
-def : ProcNoItin<"sc300",                               [ARMv7m, ProcM3]>;
+def : ProcessorModel<"cortex-m3", CortexM3Model,        [ARMv7m,
+                                                         ProcM3,
+                                                         FeatureHasNoBranchPredictor]>;
+
+def : ProcessorModel<"sc300",     CortexM3Model,        [ARMv7m,
+                                                         ProcM3,
+                                                         FeatureHasNoBranchPredictor]>;
 
-def : ProcNoItin<"cortex-m4",                           [ARMv7em,
+def : ProcessorModel<"cortex-m4", CortexM3Model,        [ARMv7em,
                                                          FeatureVFP4,
                                                          FeatureVFPOnlySP,
-                                                         FeatureD16]>;
+                                                         FeatureD16,
+                                                         FeatureHasNoBranchPredictor]>;
 
 def : ProcNoItin<"cortex-m7",                           [ARMv7em,
                                                          FeatureFPARMv8,
@@ -771,11 +784,12 @@ def : ProcNoItin<"cortex-m7",
 def : ProcNoItin<"cortex-m23",                          [ARMv8mBaseline,
                                                          FeatureNoMovt]>;
 
-def : ProcNoItin<"cortex-m33",                          [ARMv8mMainline,
+def : ProcessorModel<"cortex-m33", CortexM3Model,       [ARMv8mMainline,
                                                          FeatureDSP,
                                                          FeatureFPARMv8,
                                                          FeatureD16,
-                                                         FeatureVFPOnlySP]>;
+                                                         FeatureVFPOnlySP,
+                                                         FeatureHasNoBranchPredictor]>;
 
 def : ProcNoItin<"cortex-a32",                           [ARMv8a,
                                                          FeatureHWDivThumb,

Modified: llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp?rev=306547&r1=306546&r2=306547&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp Wed Jun 28 07:11:15 2017
@@ -1851,9 +1851,9 @@ isProfitableToIfCvt(MachineBasicBlock &M
 }
 
 bool ARMBaseInstrInfo::
-isProfitableToIfCvt(MachineBasicBlock &,
+isProfitableToIfCvt(MachineBasicBlock &TBB,
                     unsigned TCycles, unsigned TExtra,
-                    MachineBasicBlock &,
+                    MachineBasicBlock &FBB,
                     unsigned FCycles, unsigned FExtra,
                     BranchProbability Probability) const {
   if (!TCycles)
@@ -1863,14 +1863,43 @@ isProfitableToIfCvt(MachineBasicBlock &,
   // Here we scale up each component of UnpredCost to avoid precision issue when
   // scaling TCycles/FCycles by Probability.
   const unsigned ScalingUpFactor = 1024;
-  unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
-  unsigned FUnpredCost =
+
+  unsigned PredCost = (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor;
+  unsigned UnpredCost;
+  if (!Subtarget.hasBranchPredictor()) {
+    // When we don't have a branch predictor it's always cheaper to not take a
+    // branch than take it, so we have to take that into account.
+    unsigned NotTakenBranchCost = 1;
+    unsigned TakenBranchCost = Subtarget.getMispredictionPenalty();
+    unsigned TUnpredCycles, FUnpredCycles;
+    if (!FCycles) {
+      // Triangle: TBB is the fallthrough
+      TUnpredCycles = TCycles + NotTakenBranchCost;
+      FUnpredCycles = TakenBranchCost;
+    } else {
+      // Diamond: TBB is the block that is branched to, FBB is the fallthrough
+      TUnpredCycles = TCycles + TakenBranchCost;
+      FUnpredCycles = FCycles + NotTakenBranchCost;
+    }
+    // The total cost is the cost of each path scaled by their probabilities
+    unsigned TUnpredCost = Probability.scale(TUnpredCycles * ScalingUpFactor);
+    unsigned FUnpredCost = Probability.getCompl().scale(FUnpredCycles * ScalingUpFactor);
+    UnpredCost = TUnpredCost + FUnpredCost;
+    // When predicating assume that the first IT can be folded away but later
+    // ones cost one cycle each
+    if (Subtarget.isThumb2() && TCycles + FCycles > 4) {
+      PredCost += ((TCycles + FCycles - 4) / 4) * ScalingUpFactor;
+    }
+  } else {
+    unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
+    unsigned FUnpredCost =
       Probability.getCompl().scale(FCycles * ScalingUpFactor);
-  unsigned UnpredCost = TUnpredCost + FUnpredCost;
-  UnpredCost += 1 * ScalingUpFactor; // The branch itself
-  UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+    UnpredCost = TUnpredCost + FUnpredCost;
+    UnpredCost += 1 * ScalingUpFactor; // The branch itself
+    UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+  }
 
-  return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= UnpredCost;
+  return PredCost <= UnpredCost;
 }
 
 bool

Modified: llvm/trunk/lib/Target/ARM/ARMSchedule.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMSchedule.td?rev=306547&r1=306546&r2=306547&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMSchedule.td (original)
+++ llvm/trunk/lib/Target/ARM/ARMSchedule.td Wed Jun 28 07:11:15 2017
@@ -424,3 +424,4 @@ include "ARMScheduleA9.td"
 include "ARMScheduleSwift.td"
 include "ARMScheduleR52.td"
 include "ARMScheduleA57.td"
+include "ARMScheduleM3.td"

Added: llvm/trunk/lib/Target/ARM/ARMScheduleM3.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMScheduleM3.td?rev=306547&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMScheduleM3.td (added)
+++ llvm/trunk/lib/Target/ARM/ARMScheduleM3.td Wed Jun 28 07:11:15 2017
@@ -0,0 +1,21 @@
+//=- ARMScheduleM3.td - ARM Cortex-M3 Scheduling Definitions -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the ARM Cortex-M3 processor.
+//
+//===----------------------------------------------------------------------===//
+
+def CortexM3Model : SchedMachineModel {
+  let IssueWidth        = 1; // Only IT can be dual-issued, so assume single-issue
+  let MicroOpBufferSize = 0; // In-order
+  let LoadLatency       = 2; // Latency when not pipelined, not pc-relative
+  let MispredictPenalty = 2; // Best case branch taken cost
+
+  let CompleteModel = 0;
+}

Modified: llvm/trunk/lib/Target/ARM/ARMSubtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMSubtarget.h?rev=306547&r1=306546&r2=306547&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMSubtarget.h (original)
+++ llvm/trunk/lib/Target/ARM/ARMSubtarget.h Wed Jun 28 07:11:15 2017
@@ -246,6 +246,11 @@ protected:
   /// avoid issue "normal" call instructions to callees which do not return.
   bool HasRetAddrStack = false;
 
+  /// HasBranchPredictor - True if the subtarget has a branch predictor. Having
+  /// a branch predictor or not changes the expected cost of taking a branch
+  /// which affects the choice of whether to use predicated instructions.
+  bool HasBranchPredictor = true;
+
   /// HasMPExtension - True if the subtarget supports Multiprocessing
   /// extension (ARMv7 only).
   bool HasMPExtension = false;
@@ -554,6 +559,7 @@ public:
   bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; }
   bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
   bool hasRetAddrStack() const { return HasRetAddrStack; }
+  bool hasBranchPredictor() const { return HasBranchPredictor; }
   bool hasMPExtension() const { return HasMPExtension; }
   bool hasDSP() const { return HasDSP; }
   bool useNaClTrap() const { return UseNaClTrap; }

Added: llvm/trunk/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll?rev=306547&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll (added)
+++ llvm/trunk/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll Wed Jun 28 07:11:15 2017
@@ -0,0 +1,154 @@
+; RUN: llc < %s -mtriple=thumbv7m -mcpu=cortex-m7 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BP
+; RUN: llc < %s -mtriple=thumbv7m -mcpu=cortex-m3 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOBP
+
+declare void @otherfn()
+
+; CHECK-LABEL: triangle1:
+; CHECK: itt ne
+; CHECK: movne
+; CHECK: strne
+define i32 @triangle1(i32 %n, i32* %p) {
+entry:
+  %tobool = icmp eq i32 %n, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+  store i32 1, i32* %p, align 4
+  br label %if.end
+
+if.end:
+  tail call void @otherfn()
+  ret i32 0
+}
+
+; CHECK-LABEL: triangle2:
+; CHECK-BP: itttt ne
+; CHECK-BP: movne
+; CHECK-BP: strne
+; CHECK-BP: movne
+; CHECK-BP: strne
+; CHECK-NOBP: cbz
+; CHECK-NOBP: movs
+; CHECK-NOBP: str
+; CHECK-NOBP: movs
+; CHECK-NOBP: str
+define i32 @triangle2(i32 %n, i32* %p, i32* %q) {
+entry:
+  %tobool = icmp eq i32 %n, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+  store i32 1, i32* %p, align 4
+  store i32 2, i32* %q, align 4
+  br label %if.end
+
+if.end:
+  tail call void @otherfn()
+  ret i32 0
+}
+
+; CHECK-LABEL: triangle3:
+; CHECK: cbz
+; CHECK: movs
+; CHECK: str
+; CHECK: movs
+; CHECK: str
+; CHECK: movs
+; CHECK: str
+define i32 @triangle3(i32 %n, i32* %p, i32* %q, i32* %r) {
+entry:
+  %tobool = icmp eq i32 %n, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+  store i32 1, i32* %p, align 4
+  store i32 2, i32* %q, align 4
+  store i32 3, i32* %r, align 4
+  br label %if.end
+
+if.end:
+  tail call void @otherfn()
+  ret i32 0
+}
+
+; CHECK-LABEL: diamond1:
+; CHECK: ite eq
+; CHECK: ldreq
+; CHECK: strne
+define i32 @diamond1(i32 %n, i32* %p) {
+entry:
+  %tobool = icmp eq i32 %n, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+  store i32 %n, i32* %p, align 4
+  br label %if.end
+
+if.else:
+  %0 = load i32, i32* %p, align 4
+  br label %if.end
+
+if.end:
+  %n.addr.0 = phi i32 [ %n, %if.then ], [ %0, %if.else ]
+  tail call void @otherfn()
+  ret i32 %n.addr.0
+}
+
+; CHECK-LABEL: diamond2:
+; CHECK-BP: itte
+; CHECK-BP: streq
+; CHECK-BP: ldreq
+; CHECK-BP: strne
+; CHECK-NOBP: cbz
+; CHECK-NOBP: str
+; CHECK-NOBP: b
+; CHECK-NOBP: str
+; CHECK-NOBP: ldr
+define i32 @diamond2(i32 %n, i32 %m, i32* %p, i32* %q) {
+entry:
+  %tobool = icmp eq i32 %n, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+  store i32 %n, i32* %p, align 4
+  br label %if.end
+
+if.else:
+  store i32 %m, i32* %q, align 4
+  %0 = load i32, i32* %p, align 4
+  br label %if.end
+
+if.end:
+  %n.addr.0 = phi i32 [ %n, %if.then ], [ %0, %if.else ]
+  tail call void @otherfn()
+  ret i32 %n.addr.0
+}
+
+; CHECK-LABEL: diamond3:
+; CHECK: cbz
+; CHECK: movs
+; CHECK: str
+; CHECK: b
+; CHECK: ldr
+; CHECK: ldr
+; CHECK: adds
+define i32 @diamond3(i32 %n, i32* %p, i32* %q) {
+entry:
+  %tobool = icmp eq i32 %n, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+  store i32 1, i32* %p, align 4
+  br label %if.end
+
+if.else:
+  %0 = load i32, i32* %p, align 4
+  %1 = load i32, i32* %q, align 4
+  %add = add nsw i32 %1, %0
+  br label %if.end
+
+if.end:
+  %n.addr.0 = phi i32 [ %n, %if.then ], [ %add, %if.else ]
+  tail call void @otherfn()
+  ret i32 %n.addr.0
+}




More information about the llvm-commits mailing list