[llvm] r306547 - [ARM] Improve if-conversion for M-class CPUs without branch predictors
John Brawn via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 28 07:11:16 PDT 2017
Author: john.brawn
Date: Wed Jun 28 07:11:15 2017
New Revision: 306547
URL: http://llvm.org/viewvc/llvm-project?rev=306547&view=rev
Log:
[ARM] Improve if-conversion for M-class CPUs without branch predictors
The current heuristic in isProfitableToIfCvt assumes we have a branch predictor,
and so gives the wrong answer in some cases when we don't. This patch adds a
subtarget feature to indicate that a subtarget has no branch predictor, and
changes the heuristic in isProfitableToiIfCvt when it's present. This gives a
slight overall improvement in a set of embedded benchmarks on Cortex-M4 and
Cortex-M33.
Differential Revision: https://reviews.llvm.org/D34398
Added:
llvm/trunk/lib/Target/ARM/ARMScheduleM3.td
llvm/trunk/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll
Modified:
llvm/trunk/lib/Target/ARM/ARM.td
llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
llvm/trunk/lib/Target/ARM/ARMSchedule.td
llvm/trunk/lib/Target/ARM/ARMSubtarget.h
Modified: llvm/trunk/lib/Target/ARM/ARM.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARM.td?rev=306547&r1=306546&r2=306547&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARM.td (original)
+++ llvm/trunk/lib/Target/ARM/ARM.td Wed Jun 28 07:11:15 2017
@@ -222,6 +222,13 @@ def FeatureAvoidMOVsShOp : SubtargetFeat
def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true",
"Has return address stack">;
+// Some processors have no branch predictor, which changes the expected cost of
+// taking a branch which affects the choice of whether to use predicated
+// instructions.
+def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor",
+ "HasBranchPredictor", "false",
+ "Has no branch predictor">;
+
/// DSP extension.
def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true",
"Supports DSP instructions in ARM and/or Thumb2">;
@@ -756,13 +763,19 @@ def : ProcessorModel<"cortex-r8", Cort
FeatureHasSlowFPVMLx,
FeatureAvoidPartialCPSR]>;
-def : ProcNoItin<"cortex-m3", [ARMv7m, ProcM3]>;
-def : ProcNoItin<"sc300", [ARMv7m, ProcM3]>;
+def : ProcessorModel<"cortex-m3", CortexM3Model, [ARMv7m,
+ ProcM3,
+ FeatureHasNoBranchPredictor]>;
+
+def : ProcessorModel<"sc300", CortexM3Model, [ARMv7m,
+ ProcM3,
+ FeatureHasNoBranchPredictor]>;
-def : ProcNoItin<"cortex-m4", [ARMv7em,
+def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em,
FeatureVFP4,
FeatureVFPOnlySP,
- FeatureD16]>;
+ FeatureD16,
+ FeatureHasNoBranchPredictor]>;
def : ProcNoItin<"cortex-m7", [ARMv7em,
FeatureFPARMv8,
@@ -771,11 +784,12 @@ def : ProcNoItin<"cortex-m7",
def : ProcNoItin<"cortex-m23", [ARMv8mBaseline,
FeatureNoMovt]>;
-def : ProcNoItin<"cortex-m33", [ARMv8mMainline,
+def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline,
FeatureDSP,
FeatureFPARMv8,
FeatureD16,
- FeatureVFPOnlySP]>;
+ FeatureVFPOnlySP,
+ FeatureHasNoBranchPredictor]>;
def : ProcNoItin<"cortex-a32", [ARMv8a,
FeatureHWDivThumb,
Modified: llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp?rev=306547&r1=306546&r2=306547&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp Wed Jun 28 07:11:15 2017
@@ -1851,9 +1851,9 @@ isProfitableToIfCvt(MachineBasicBlock &M
}
bool ARMBaseInstrInfo::
-isProfitableToIfCvt(MachineBasicBlock &,
+isProfitableToIfCvt(MachineBasicBlock &TBB,
unsigned TCycles, unsigned TExtra,
- MachineBasicBlock &,
+ MachineBasicBlock &FBB,
unsigned FCycles, unsigned FExtra,
BranchProbability Probability) const {
if (!TCycles)
@@ -1863,14 +1863,43 @@ isProfitableToIfCvt(MachineBasicBlock &,
// Here we scale up each component of UnpredCost to avoid precision issue when
// scaling TCycles/FCycles by Probability.
const unsigned ScalingUpFactor = 1024;
- unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
- unsigned FUnpredCost =
+
+ unsigned PredCost = (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor;
+ unsigned UnpredCost;
+ if (!Subtarget.hasBranchPredictor()) {
+ // When we don't have a branch predictor it's always cheaper to not take a
+ // branch than take it, so we have to take that into account.
+ unsigned NotTakenBranchCost = 1;
+ unsigned TakenBranchCost = Subtarget.getMispredictionPenalty();
+ unsigned TUnpredCycles, FUnpredCycles;
+ if (!FCycles) {
+ // Triangle: TBB is the fallthrough
+ TUnpredCycles = TCycles + NotTakenBranchCost;
+ FUnpredCycles = TakenBranchCost;
+ } else {
+ // Diamond: TBB is the block that is branched to, FBB is the fallthrough
+ TUnpredCycles = TCycles + TakenBranchCost;
+ FUnpredCycles = FCycles + NotTakenBranchCost;
+ }
+ // The total cost is the cost of each path scaled by their probabilites
+ unsigned TUnpredCost = Probability.scale(TUnpredCycles * ScalingUpFactor);
+ unsigned FUnpredCost = Probability.getCompl().scale(FUnpredCycles * ScalingUpFactor);
+ UnpredCost = TUnpredCost + FUnpredCost;
+ // When predicating assume that the first IT can be folded away but later
+ // ones cost one cycle each
+ if (Subtarget.isThumb2() && TCycles + FCycles > 4) {
+ PredCost += ((TCycles + FCycles - 4) / 4) * ScalingUpFactor;
+ }
+ } else {
+ unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
+ unsigned FUnpredCost =
Probability.getCompl().scale(FCycles * ScalingUpFactor);
- unsigned UnpredCost = TUnpredCost + FUnpredCost;
- UnpredCost += 1 * ScalingUpFactor; // The branch itself
- UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+ UnpredCost = TUnpredCost + FUnpredCost;
+ UnpredCost += 1 * ScalingUpFactor; // The branch itself
+ UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+ }
- return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= UnpredCost;
+ return PredCost <= UnpredCost;
}
bool
Modified: llvm/trunk/lib/Target/ARM/ARMSchedule.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMSchedule.td?rev=306547&r1=306546&r2=306547&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMSchedule.td (original)
+++ llvm/trunk/lib/Target/ARM/ARMSchedule.td Wed Jun 28 07:11:15 2017
@@ -424,3 +424,4 @@ include "ARMScheduleA9.td"
include "ARMScheduleSwift.td"
include "ARMScheduleR52.td"
include "ARMScheduleA57.td"
+include "ARMScheduleM3.td"
Added: llvm/trunk/lib/Target/ARM/ARMScheduleM3.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMScheduleM3.td?rev=306547&view=auto
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMScheduleM3.td (added)
+++ llvm/trunk/lib/Target/ARM/ARMScheduleM3.td Wed Jun 28 07:11:15 2017
@@ -0,0 +1,21 @@
+//=- ARMScheduleM3.td - ARM Cortex-M3 Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the ARM Cortex-M3 processor.
+//
+//===----------------------------------------------------------------------===//
+
+def CortexM3Model : SchedMachineModel {
+ let IssueWidth = 1; // Only IT can be dual-issued, so assume single-issue
+ let MicroOpBufferSize = 0; // In-order
+ let LoadLatency = 2; // Latency when not pipelined, not pc-relative
+ let MispredictPenalty = 2; // Best case branch taken cost
+
+ let CompleteModel = 0;
+}
Modified: llvm/trunk/lib/Target/ARM/ARMSubtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMSubtarget.h?rev=306547&r1=306546&r2=306547&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMSubtarget.h (original)
+++ llvm/trunk/lib/Target/ARM/ARMSubtarget.h Wed Jun 28 07:11:15 2017
@@ -246,6 +246,11 @@ protected:
/// avoid issue "normal" call instructions to callees which do not return.
bool HasRetAddrStack = false;
+ /// HasBranchPredictor - True if the subtarget has a branch predictor. Having
+ /// a branch predictor or not changes the expected cost of taking a branch
+ /// which affects the choice of whether to use predicated instructions.
+ bool HasBranchPredictor = true;
+
/// HasMPExtension - True if the subtarget supports Multiprocessing
/// extension (ARMv7 only).
bool HasMPExtension = false;
@@ -554,6 +559,7 @@ public:
bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; }
bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
bool hasRetAddrStack() const { return HasRetAddrStack; }
+ bool hasBranchPredictor() const { return HasBranchPredictor; }
bool hasMPExtension() const { return HasMPExtension; }
bool hasDSP() const { return HasDSP; }
bool useNaClTrap() const { return UseNaClTrap; }
Added: llvm/trunk/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll?rev=306547&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll (added)
+++ llvm/trunk/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll Wed Jun 28 07:11:15 2017
@@ -0,0 +1,154 @@
+; RUN: llc < %s -mtriple=thumbv7m -mcpu=cortex-m7 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BP
+; RUN: llc < %s -mtriple=thumbv7m -mcpu=cortex-m3 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOBP
+
+declare void @otherfn()
+
+; CHECK-LABEL: triangle1:
+; CHECK: itt ne
+; CHECK: movne
+; CHECK: strne
+define i32 @triangle1(i32 %n, i32* %p) {
+entry:
+ %tobool = icmp eq i32 %n, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+ store i32 1, i32* %p, align 4
+ br label %if.end
+
+if.end:
+ tail call void @otherfn()
+ ret i32 0
+}
+
+; CHECK-LABEL: triangle2:
+; CHECK-BP: itttt ne
+; CHECK-BP: movne
+; CHECK-BP: strne
+; CHECK-BP: movne
+; CHECK-BP: strne
+; CHECK-NOBP: cbz
+; CHECK-NOBP: movs
+; CHECK-NOBP: str
+; CHECK-NOBP: movs
+; CHECK-NOBP: str
+define i32 @triangle2(i32 %n, i32* %p, i32* %q) {
+entry:
+ %tobool = icmp eq i32 %n, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+ store i32 1, i32* %p, align 4
+ store i32 2, i32* %q, align 4
+ br label %if.end
+
+if.end:
+ tail call void @otherfn()
+ ret i32 0
+}
+
+; CHECK-LABEL: triangle3:
+; CHECK: cbz
+; CHECK: movs
+; CHECK: str
+; CHECK: movs
+; CHECK: str
+; CHECK: movs
+; CHECK: str
+define i32 @triangle3(i32 %n, i32* %p, i32* %q, i32* %r) {
+entry:
+ %tobool = icmp eq i32 %n, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+ store i32 1, i32* %p, align 4
+ store i32 2, i32* %q, align 4
+ store i32 3, i32* %r, align 4
+ br label %if.end
+
+if.end:
+ tail call void @otherfn()
+ ret i32 0
+}
+
+; CHECK-LABEL: diamond1:
+; CHECK: ite eq
+; CHECK: ldreq
+; CHECK: strne
+define i32 @diamond1(i32 %n, i32* %p) {
+entry:
+ %tobool = icmp eq i32 %n, 0
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+ store i32 %n, i32* %p, align 4
+ br label %if.end
+
+if.else:
+ %0 = load i32, i32* %p, align 4
+ br label %if.end
+
+if.end:
+ %n.addr.0 = phi i32 [ %n, %if.then ], [ %0, %if.else ]
+ tail call void @otherfn()
+ ret i32 %n.addr.0
+}
+
+; CHECK-LABEL: diamond2:
+; CHECK-BP: itte
+; CHECK-BP: streq
+; CHECK-BP: ldreq
+; CHECK-BP: strne
+; CHECK-NOBP: cbz
+; CHECK-NOBP: str
+; CHECK-NOBP: b
+; CHECK-NOBP: str
+; CHECK-NOBP: ldr
+define i32 @diamond2(i32 %n, i32 %m, i32* %p, i32* %q) {
+entry:
+ %tobool = icmp eq i32 %n, 0
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+ store i32 %n, i32* %p, align 4
+ br label %if.end
+
+if.else:
+ store i32 %m, i32* %q, align 4
+ %0 = load i32, i32* %p, align 4
+ br label %if.end
+
+if.end:
+ %n.addr.0 = phi i32 [ %n, %if.then ], [ %0, %if.else ]
+ tail call void @otherfn()
+ ret i32 %n.addr.0
+}
+
+; CHECK-LABEL: diamond3:
+; CHECK: cbz
+; CHECK: movs
+; CHECK: str
+; CHECK: b
+; CHECK: ldr
+; CHECK: ldr
+; CHECK: adds
+define i32 @diamond3(i32 %n, i32* %p, i32* %q) {
+entry:
+ %tobool = icmp eq i32 %n, 0
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+ store i32 1, i32* %p, align 4
+ br label %if.end
+
+if.else:
+ %0 = load i32, i32* %p, align 4
+ %1 = load i32, i32* %q, align 4
+ %add = add nsw i32 %1, %0
+ br label %if.end
+
+if.end:
+ %n.addr.0 = phi i32 [ %n, %if.then ], [ %add, %if.else ]
+ tail call void @otherfn()
+ ret i32 %n.addr.0
+}
More information about the llvm-commits
mailing list