[llvm-commits] [llvm] r171946 - in /llvm/trunk: include/llvm/CodeGen/TargetSchedule.h include/llvm/MC/MCSchedule.h include/llvm/Target/TargetSchedule.td lib/CodeGen/MachineScheduler.cpp lib/Target/ARM/ARMScheduleA9.td lib/Target/X86/X86Schedule.td lib/Target/X86/X86ScheduleAtom.td test/CodeGen/ARM/misched-inorder-latency.ll test/CodeGen/PowerPC/misched-inorder-latency.ll utils/TableGen/SubtargetEmitter.cpp

Fri Jan 11 09:38:25 PST 2013

----- Original Message -----
> From: "Andrew Trick" <atrick at apple.com>
> To: llvm-commits at cs.uiuc.edu
> Sent: Tuesday, January 8, 2013 9:36:49 PM
> Subject: [llvm-commits] [llvm] r171946 - in /llvm/trunk: include/llvm/CodeGen/TargetSchedule.h
> include/llvm/MC/MCSchedule.h include/llvm/Target/TargetSchedule.td lib/CodeGen/MachineScheduler.cpp
> lib/Target/ARM/ARMScheduleA9.td lib/Target/X86/X86Schedule.td lib/Target/X86/X86ScheduleAtom.td
> test/CodeGen/ARM/misched-inorder-latency.ll test/CodeGen/PowerPC/misched-inorder-latency.ll
> utils/TableGen/SubtargetEmitter.cpp
> 
> Author: atrick
> Date: Tue Jan  8 21:36:49 2013
> New Revision: 171946
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=171946&view=rev
> Log:
> MIsched: add an ILP window property to machine model.
> 
> This was an experimental option, but it needs to be defined
> per-target; e.g., PPC A2 needs to aggressively hide latency.
> 
> I converted some in-order scheduling tests to A2. Hal is working on
> more test cases.

Andy, thanks!

Also, do you want to do the same thing with enable-aa-sched-mi?

 -Hal

> 
> Added:
>     llvm/trunk/test/CodeGen/PowerPC/misched-inorder-latency.ll
>       - copied, changed from r171943,
>       llvm/trunk/test/CodeGen/ARM/misched-inorder-latency.ll
> Removed:
>     llvm/trunk/test/CodeGen/ARM/misched-inorder-latency.ll
> Modified:
>     llvm/trunk/include/llvm/CodeGen/TargetSchedule.h
>     llvm/trunk/include/llvm/MC/MCSchedule.h
>     llvm/trunk/include/llvm/Target/TargetSchedule.td
>     llvm/trunk/lib/CodeGen/MachineScheduler.cpp
>     llvm/trunk/lib/Target/ARM/ARMScheduleA9.td
>     llvm/trunk/lib/Target/X86/X86Schedule.td
>     llvm/trunk/lib/Target/X86/X86ScheduleAtom.td
>     llvm/trunk/utils/TableGen/SubtargetEmitter.cpp
> 
> Modified: llvm/trunk/include/llvm/CodeGen/TargetSchedule.h
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/TargetSchedule.h?rev=171946&r1=171945&r2=171946&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm/CodeGen/TargetSchedule.h (original)
> +++ llvm/trunk/include/llvm/CodeGen/TargetSchedule.h Tue Jan  8
> 21:36:49 2013
> @@ -84,6 +84,9 @@
>    /// \brief Maximum number of micro-ops that may be scheduled per
>    cycle.
>    unsigned getIssueWidth() const { return SchedModel.IssueWidth; }
>  
> +  /// \brief Number of cycles the OOO processor is expected to hide.
> +  unsigned getILPWindow() const { return SchedModel.ILPWindow; }
> +
>    /// \brief Return the number of issue slots required for this MI.
>    unsigned getNumMicroOps(const MachineInstr *MI,
>                            const MCSchedClassDesc *SC = 0) const;
> 
> Modified: llvm/trunk/include/llvm/MC/MCSchedule.h
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/MC/MCSchedule.h?rev=171946&r1=171945&r2=171946&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm/MC/MCSchedule.h (original)
> +++ llvm/trunk/include/llvm/MC/MCSchedule.h Tue Jan  8 21:36:49 2013
> @@ -155,7 +155,7 @@
>    //      Optional InstrItinerary OperandCycles provides expected
>    latency.
>    //      TODO: can't yet specify both min and expected latency per
>    operand.
>    int MinLatency;
> -  static const unsigned DefaultMinLatency = -1;
> +  static const int DefaultMinLatency = -1;
>  
>    // LoadLatency is the expected latency of load instructions.
>    //
> @@ -172,6 +172,16 @@
>    unsigned HighLatency;
>    static const unsigned DefaultHighLatency = 10;
>  
> +  // ILPWindow is the number of cycles that the scheduler
> effectively ignores
> +  // before attempting to hide latency. This should be zero for
> in-order cpus to
> +  // always hide expected latency. For out-of-order cpus, it may be
> tweaked as
> +  // desired to roughly approximate instruction buffers. The actual
> threshold is
> +  // not very important for an OOO processor, as long as it isn't
> too high. A
> +  // nonzero value helps avoid rescheduling to hide latency when it
> is fairly
> +  // obviously useless and makes register pressure heuristics more
> effective.
> +  unsigned ILPWindow;
> +  static const unsigned DefaultILPWindow = 0;
> +
>    // MispredictPenalty is the typical number of extra cycles the
>    processor
>    // takes to recover from a branch misprediction.
>    unsigned MispredictPenalty;
> @@ -196,6 +206,7 @@
>                    MinLatency(DefaultMinLatency),
>                    LoadLatency(DefaultLoadLatency),
>                    HighLatency(DefaultHighLatency),
> +                  ILPWindow(DefaultILPWindow),
>                    MispredictPenalty(DefaultMispredictPenalty),
>                    ProcID(0), ProcResourceTable(0),
>                    SchedClassTable(0),
>                    NumProcResourceKinds(0), NumSchedClasses(0),
> @@ -205,12 +216,12 @@
>    }
>  
>    // Table-gen driven ctor.
> -  MCSchedModel(unsigned iw, int ml, unsigned ll, unsigned hl,
> unsigned mp,
> -               unsigned pi, const MCProcResourceDesc *pr,
> +  MCSchedModel(unsigned iw, int ml, unsigned ll, unsigned hl,
> unsigned ilp,
> +               unsigned mp, unsigned pi, const MCProcResourceDesc
> *pr,
>                 const MCSchedClassDesc *sc, unsigned npr, unsigned
>                 nsc,
>                 const InstrItinerary *ii):
>      IssueWidth(iw), MinLatency(ml), LoadLatency(ll),
>      HighLatency(hl),
> -    MispredictPenalty(mp), ProcID(pi), ProcResourceTable(pr),
> +    ILPWindow(ilp), MispredictPenalty(mp), ProcID(pi),
> ProcResourceTable(pr),
>      SchedClassTable(sc), NumProcResourceKinds(npr),
>      NumSchedClasses(nsc),
>      InstrItineraries(ii) {}
>  
> 
> Modified: llvm/trunk/include/llvm/Target/TargetSchedule.td
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetSchedule.td?rev=171946&r1=171945&r2=171946&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm/Target/TargetSchedule.td (original)
> +++ llvm/trunk/include/llvm/Target/TargetSchedule.td Tue Jan  8
> 21:36:49 2013
> @@ -76,6 +76,7 @@
>    int IssueWidth = -1; // Max micro-ops that may be scheduled per
>    cycle.
>    int MinLatency = -1; // Determines which instrucions are allowed
>    in a group.
>                         // (-1) inorder (0) ooo, (1): inorder +var
>                         latencies.
> +  int ILPWindow = -1;  // Cycles of latency likely hidden by
> hardware buffers.
>    int LoadLatency = -1; // Cycles for loads to access the cache.
>    int HighLatency = -1; // Approximation of cycles for "high
>    latency" ops.
>    int MispredictPenalty = -1; // Extra cycles for a mispredicted
>    branch.
> 
> Modified: llvm/trunk/lib/CodeGen/MachineScheduler.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineScheduler.cpp?rev=171946&r1=171945&r2=171946&view=diff
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/MachineScheduler.cpp (original)
> +++ llvm/trunk/lib/CodeGen/MachineScheduler.cpp Tue Jan  8 21:36:49
> 2013
> @@ -48,15 +48,6 @@
>  static bool ViewMISchedDAGs = false;
>  #endif // NDEBUG
>  
> -// Threshold to very roughly model an out-of-order processor's
> instruction
> -// buffers. If the actual value of this threshold matters much in
> practice, then
> -// it can be specified by the machine model. For now, it's an
> experimental
> -// tuning knob to determine when and if it matters.
> -static cl::opt<unsigned> ILPWindow("ilp-window", cl::Hidden,
> -  cl::desc("Allow expected latency to exceed the critical path by N
> cycles "
> -           "before attempting to balance ILP"),
> -  cl::init(10U));
> -
>  // Experimental heuristics
>  static cl::opt<bool> EnableLoadCluster("misched-cluster",
>  cl::Hidden,
>    cl::desc("Enable load clustering."), cl::init(true));
> @@ -1297,7 +1288,8 @@
>      if (L > RemLatency)
>        RemLatency = L;
>    }
> -  if (RemLatency + ExpectedLatency >= Rem->CriticalPath + ILPWindow
> +  unsigned CriticalPathLimit = Rem->CriticalPath +
> SchedModel->getILPWindow();
> +  if (RemLatency + ExpectedLatency >= CriticalPathLimit
>        && RemLatency > Rem->getMaxRemainingCount(SchedModel)) {
>      Policy.ReduceLatency = true;
>      DEBUG(dbgs() << "Increase ILP: " << Available.getName() <<
>      '\n');
> 
> Modified: llvm/trunk/lib/Target/ARM/ARMScheduleA9.td
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMScheduleA9.td?rev=171946&r1=171945&r2=171946&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/ARM/ARMScheduleA9.td (original)
> +++ llvm/trunk/lib/Target/ARM/ARMScheduleA9.td Tue Jan  8 21:36:49
> 2013
> @@ -1887,6 +1887,9 @@
>    let LoadLatency = 2; // Optimistic load latency assuming bypass.
>                         // This is overriden by OperandCycles if the
>                         // Itineraries are queried instead.
> +  let ILPWindow = 10; // Don't reschedule small blocks to hide
> +                      // latency. Minimum latency requirements are
> already
> +                      // modeled strictly by reserving resources.
>    let MispredictPenalty = 8; // Based on estimate of pipeline depth.
>  
>    let Itineraries = CortexA9Itineraries;
> 
> Modified: llvm/trunk/lib/Target/X86/X86Schedule.td
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Schedule.td?rev=171946&r1=171945&r2=171946&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86Schedule.td (original)
> +++ llvm/trunk/lib/Target/X86/X86Schedule.td Tue Jan  8 21:36:49 2013
> @@ -470,12 +470,17 @@
>  // latencies. Since these latencies are not used for pipeline
>  hazards,
>  // they do not need to be exact.
>  //
> +// ILPWindow=10 is an arbitrary threshold that approximates cycles
> of
> +// latency hidden by instruction buffers. The actual value is not
> very
> +// important but should be zero for inorder and nonzero for OOO
> processors.
> +//
>  // The GenericModel contains no instruciton itineraries.
>  def GenericModel : SchedMachineModel {
>    let IssueWidth = 4;
>    let MinLatency = 0;
>    let LoadLatency = 4;
>    let HighLatency = 10;
> +  let ILPWindow = 10;
>  }
>  
>  include "X86ScheduleAtom.td"
> 
> Modified: llvm/trunk/lib/Target/X86/X86ScheduleAtom.td
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ScheduleAtom.td?rev=171946&r1=171945&r2=171946&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ScheduleAtom.td (original)
> +++ llvm/trunk/lib/Target/X86/X86ScheduleAtom.td Tue Jan  8 21:36:49
> 2013
> @@ -525,6 +525,7 @@
>                         // OperandCycles may be used for expected
>                         latency.
>    let LoadLatency = 3; // Expected cycles, may be overriden by
>    OperandCycles.
>    let HighLatency = 30;// Expected, may be overriden by
>    OperandCycles.
> +  let ILPWindow = 0; // Always try to hide expected latency.
>  
>    let Itineraries = AtomItineraries;
>  }
> 
> Removed: llvm/trunk/test/CodeGen/ARM/misched-inorder-latency.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/misched-inorder-latency.ll?rev=171945&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/misched-inorder-latency.ll (original)
> +++ llvm/trunk/test/CodeGen/ARM/misched-inorder-latency.ll (removed)
> @@ -1,48 +0,0 @@
> -; RUN: llc < %s -enable-misched -march=thumb -mcpu=swift \
> -; RUN:          -pre-RA-sched=source -scheditins=false -ilp-window=0
> \
> -; RUN:          -disable-ifcvt-triangle-false -disable-post-ra |
> FileCheck %s
> -;
> -; For these tests, we set -ilp-window=0 to simulate in order
> processor.
> -
> -; %val1 is a 3-cycle load live out of %entry. It should be hoisted
> -; above the add.
> -; CHECK: @testload
> -; CHECK: %entry
> -; CHECK: ldr
> -; CHECK: adds
> -; CHECK: bne
> -; CHECK: %true
> -define i32 @testload(i32 *%ptr, i32 %sumin) {
> -entry:
> -  %sum1 = add i32 %sumin, 1
> -  %val1 = load i32* %ptr
> -  %p = icmp eq i32 %sumin, 0
> -  br i1 %p, label %true, label %end
> -true:
> -  %sum2 = add i32 %sum1, 1
> -  %ptr2 = getelementptr i32* %ptr, i32 1
> -  %val = load i32* %ptr2
> -  %val2 = add i32 %val1, %val
> -  br label %end
> -end:
> -  %valmerge = phi i32 [ %val1, %entry], [ %val2, %true ]
> -  %summerge = phi i32 [ %sum1, %entry], [ %sum2, %true ]
> -  %sumout = add i32 %valmerge, %summerge
> -  ret i32 %sumout
> -}
> -
> -; The prefetch gets a default latency of 3 cycles and should be
> hoisted
> -; above the add.
> -;
> -; CHECK: @testprefetch
> -; CHECK: %entry
> -; CHECK: pld
> -; CHECK: adds
> -; CHECK: bx
> -define i32 @testprefetch(i8 *%ptr, i32 %i) {
> -entry:
> -  %tmp = add i32 %i, 1
> -  tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 )
> -  ret i32 %tmp
> -}
> -declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
> 
> Copied: llvm/trunk/test/CodeGen/PowerPC/misched-inorder-latency.ll
> (from r171943,
> llvm/trunk/test/CodeGen/ARM/misched-inorder-latency.ll)
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/misched-inorder-latency.ll?p2=llvm/trunk/test/CodeGen/PowerPC/misched-inorder-latency.ll&p1=llvm/trunk/test/CodeGen/ARM/misched-inorder-latency.ll&r1=171943&r2=171946&rev=171946&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/misched-inorder-latency.ll (original)
> +++ llvm/trunk/test/CodeGen/PowerPC/misched-inorder-latency.ll Tue
> Jan  8 21:36:49 2013
> @@ -1,15 +1,15 @@
> -; RUN: llc < %s -enable-misched -march=thumb -mcpu=swift \
> -; RUN:          -pre-RA-sched=source -scheditins=false -ilp-window=0
> \
> +; RUN: llc < %s -enable-misched -pre-RA-sched=source
> -scheditins=false \
>  ; RUN:          -disable-ifcvt-triangle-false -disable-post-ra |
>  FileCheck %s
>  ;
> -; For these tests, we set -ilp-window=0 to simulate in order
> processor.
> +target datalayout =
> "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
> +target triple = "powerpc64-bgq-linux"
>  
> -; %val1 is a 3-cycle load live out of %entry. It should be hoisted
> +; %val1 is a load live out of %entry. It should be hoisted
>  ; above the add.
> -; CHECK: @testload
> +; CHECK: testload:
>  ; CHECK: %entry
> -; CHECK: ldr
> -; CHECK: adds
> +; CHECK: lwz
> +; CHECK: addi
>  ; CHECK: bne
>  ; CHECK: %true
>  define i32 @testload(i32 *%ptr, i32 %sumin) {
> @@ -34,15 +34,22 @@
>  ; The prefetch gets a default latency of 3 cycles and should be
>  hoisted
>  ; above the add.
>  ;
> -; CHECK: @testprefetch
> +; CHECK: testprefetch:
>  ; CHECK: %entry
> -; CHECK: pld
> -; CHECK: adds
> -; CHECK: bx
> +; CHECK: dcbt
> +; CHECK: addi
> +; CHECK: blr
>  define i32 @testprefetch(i8 *%ptr, i32 %i) {
>  entry:
> -  %tmp = add i32 %i, 1
> +  %val1 = add i32 %i, 1
>    tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 )
> -  ret i32 %tmp
> +  %p = icmp eq i32 %i, 0
> +  br i1 %p, label %true, label %end
> +true:
> +  %val2 = add i32 %val1, 1
> +  br label %end
> +end:
> +  %valmerge = phi i32 [ %val1, %entry], [ %val2, %true ]
> +  ret i32 %valmerge
>  }
>  declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
> 
> Modified: llvm/trunk/utils/TableGen/SubtargetEmitter.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/utils/TableGen/SubtargetEmitter.cpp?rev=171946&r1=171945&r2=171946&view=diff
> ==============================================================================
> --- llvm/trunk/utils/TableGen/SubtargetEmitter.cpp (original)
> +++ llvm/trunk/utils/TableGen/SubtargetEmitter.cpp Tue Jan  8
> 21:36:49 2013
> @@ -1108,6 +1108,7 @@
>      EmitProcessorProp(OS, PI->ModelDef, "MinLatency", ',');
>      EmitProcessorProp(OS, PI->ModelDef, "LoadLatency", ',');
>      EmitProcessorProp(OS, PI->ModelDef, "HighLatency", ',');
> +    EmitProcessorProp(OS, PI->ModelDef, "ILPWindow", ',');
>      EmitProcessorProp(OS, PI->ModelDef, "MispredictPenalty", ',');
>      OS << "  " << PI->Index << ", // Processor ID\n";
>      if (PI->hasInstrSchedModel())
> 
> 
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
> 

-- 
Hal Finkel
Postdoctoral Appointee
Leadership Computing Facility
Argonne National Laboratory