[llvm-commits] [llvm] r171524 - in /llvm/trunk: lib/Target/X86/CMakeLists.txt lib/Target/X86/X86.h lib/Target/X86/X86.td lib/Target/X86/X86PadShortFunction.cpp lib/Target/X86/X86Subtarget.cpp lib/Target/X86/X86Subtarget.h lib/Target/X86/X86TargetMachine.cpp test/CodeGen/X86/atom-pad-short-functions.ll test/CodeGen/X86/fast-isel-x86-64.ll test/CodeGen/X86/ret-mmx.ll test/CodeGen/X86/select.ll
Nadav Rotem
nrotem at apple.com
Fri Jan 4 13:21:27 PST 2013
Also a few other comments. Please use DenseMap and not std::map. Please check for Os, and do not increase the code size if the 'optforsize' attribute is set.
On Jan 4, 2013, at 12:54 PM, Preston Gurd <preston.gurd at intel.com> wrote:
> Author: pgurd
> Date: Fri Jan 4 14:54:54 2013
> New Revision: 171524
>
> URL: http://llvm.org/viewvc/llvm-project?rev=171524&view=rev
> Log:
> The current Intel Atom microarchitecture has a feature whereby when a function
> returns early then it is slightly faster to execute a sequence of NOP
> instructions to wait until the return address is ready,
> as opposed to simply stalling on the ret instruction
> until the return address is ready.
>
> When compiling for X86 Atom only, this patch will run a pass, called
> "X86PadShortFunction" which will add NOP instructions where less than four
> cycles elapse between function entry and return.
>
> It includes tests.
>
> Patch by Andy Zhang.
>
>
> Added:
> llvm/trunk/lib/Target/X86/X86PadShortFunction.cpp
> llvm/trunk/test/CodeGen/X86/atom-pad-short-functions.ll
> Modified:
> llvm/trunk/lib/Target/X86/CMakeLists.txt
> llvm/trunk/lib/Target/X86/X86.h
> llvm/trunk/lib/Target/X86/X86.td
> llvm/trunk/lib/Target/X86/X86Subtarget.cpp
> llvm/trunk/lib/Target/X86/X86Subtarget.h
> llvm/trunk/lib/Target/X86/X86TargetMachine.cpp
> llvm/trunk/test/CodeGen/X86/fast-isel-x86-64.ll
> llvm/trunk/test/CodeGen/X86/ret-mmx.ll
> llvm/trunk/test/CodeGen/X86/select.ll
>
> Modified: llvm/trunk/lib/Target/X86/CMakeLists.txt
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/CMakeLists.txt?rev=171524&r1=171523&r2=171524&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/CMakeLists.txt (original)
> +++ llvm/trunk/lib/Target/X86/CMakeLists.txt Fri Jan 4 14:54:54 2013
> @@ -25,6 +25,7 @@
> X86JITInfo.cpp
> X86MCInstLower.cpp
> X86MachineFunctionInfo.cpp
> + X86PadShortFunction.cpp
> X86RegisterInfo.cpp
> X86SelectionDAGInfo.cpp
> X86Subtarget.cpp
>
> Modified: llvm/trunk/lib/Target/X86/X86.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86.h?rev=171524&r1=171523&r2=171524&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86.h (original)
> +++ llvm/trunk/lib/Target/X86/X86.h Fri Jan 4 14:54:54 2013
> @@ -63,6 +63,11 @@
> ///
> FunctionPass *createEmitX86CodeToMemory();
>
> +/// createX86PadShortFunctions - Return a pass that pads short functions
> +/// with NOOPs. This will prevent a stall when returning from the function
> +/// on the Atom.
> +FunctionPass *createX86PadShortFunctions();
> +
> } // End llvm namespace
>
> #endif
>
> Modified: llvm/trunk/lib/Target/X86/X86.td
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86.td?rev=171524&r1=171523&r2=171524&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86.td (original)
> +++ llvm/trunk/lib/Target/X86/X86.td Fri Jan 4 14:54:54 2013
> @@ -123,8 +123,11 @@
> def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
> "Use LEA for adjusting the stack pointer">;
> def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
> - "HasSlowDivide", "true",
> - "Use small divide for positive values less than 256">;
> + "HasSlowDivide", "true",
> + "Use small divide for positive values less than 256">;
> +def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
> + "PadShortFunctions", "true",
> + "Pad short functions">;
>
> //===----------------------------------------------------------------------===//
> // X86 processors supported.
> @@ -167,7 +170,7 @@
> FeatureSlowBTMem]>;
> def : AtomProc<"atom", [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B,
> FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
> - FeatureSlowDivide]>;
> + FeatureSlowDivide, FeaturePadShortFunctions]>;
> // "Arrandale" along with corei3 and corei5
> def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B,
> FeatureSlowBTMem, FeatureFastUAMem,
>
> Added: llvm/trunk/lib/Target/X86/X86PadShortFunction.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86PadShortFunction.cpp?rev=171524&view=auto
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86PadShortFunction.cpp (added)
> +++ llvm/trunk/lib/Target/X86/X86PadShortFunction.cpp Fri Jan 4 14:54:54 2013
> @@ -0,0 +1,184 @@
> +//===-------- X86PadShortFunction.cpp - pad short functions -----------===//
> +//
> +// The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +//
> +// This file defines the pass which will pad short functions to prevent
> +// a stall if a function returns before the return address is ready. This
> +// is needed for some Intel Atom processors.
> +//
> +//===----------------------------------------------------------------------===//
> +
> +#include <map>
> +#include <algorithm>
> +
> +#define DEBUG_TYPE "x86-pad-short-functions"
> +#include "X86.h"
> +#include "X86InstrInfo.h"
> +#include "llvm/ADT/Statistic.h"
> +#include "llvm/CodeGen/MachineFunctionPass.h"
> +#include "llvm/CodeGen/MachineInstrBuilder.h"
> +#include "llvm/CodeGen/MachineRegisterInfo.h"
> +#include "llvm/CodeGen/Passes.h"
> +#include "llvm/Support/Debug.h"
> +#include "llvm/Support/raw_ostream.h"
> +#include "llvm/Target/TargetInstrInfo.h"
> +using namespace llvm;
> +
> +STATISTIC(NumBBsPadded, "Number of basic blocks padded");
> +
> +namespace {
> + struct PadShortFunc : public MachineFunctionPass {
> + static char ID;
> + PadShortFunc() : MachineFunctionPass(ID)
> + , Threshold(4)
> + {}
> +
> + virtual bool runOnMachineFunction(MachineFunction &MF);
> +
> + virtual const char *getPassName() const
> + {
> + return "X86 Atom pad short functions";
> + }
> +
> + private:
> + bool addPadding(MachineFunction &MF,
> + MachineBasicBlock &MBB,
> + MachineBasicBlock::iterator &MBBI,
> + unsigned int NOOPsToAdd);
> +
> + void findReturn(MachineFunction &MF,
> + MachineBasicBlock &MBB,
> + unsigned int Cycles);
> +
> + bool cyclesUntilReturn(MachineFunction &MF,
> + MachineBasicBlock &MBB,
> + unsigned int &Cycles,
> + MachineBasicBlock::iterator *Location = 0);
> +
> + const unsigned int Threshold;
> + std::map<int, unsigned int> ReturnBBs;
> + };
> +
> + char PadShortFunc::ID = 0;
> +}
> +
> +FunctionPass *llvm::createX86PadShortFunctions() {
> + return new PadShortFunc();
> +}
> +
> +/// runOnMachineFunction - Loop over all of the basic blocks, inserting
> +/// NOOP instructions before early exits.
> +bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
> + // Process all basic blocks.
> + ReturnBBs.clear();
> +
> + // Search through basic blocks and mark the ones that have early returns
> + findReturn(MF, *MF.begin(), 0);
> +
> + int BBNum;
> + MachineBasicBlock::iterator ReturnLoc;
> + MachineBasicBlock *MBB;
> +
> + unsigned int Cycles = 0;
> + unsigned int BBCycles;
> +
> + // Pad the identified basic blocks with NOOPs
> + for (std::map<int, unsigned int>::iterator I = ReturnBBs.begin();
> + I != ReturnBBs.end(); ++I) {
> + BBNum = I->first;
> + Cycles = I->second;
> +
> + if (Cycles < Threshold) {
> + MBB = MF.getBlockNumbered(BBNum);
> + if (!cyclesUntilReturn(MF, *MBB, BBCycles, &ReturnLoc))
> + continue;
> +
> + addPadding(MF, *MBB, ReturnLoc, Threshold - Cycles);
> + NumBBsPadded++;
> + }
> + }
> +
> + return false;
> +}
> +
> +/// findReturn - Starting at MBB, follow control flow and add all
> +/// basic blocks that contain a return to ReturnBBs.
> +void PadShortFunc::findReturn(MachineFunction &MF,
> + MachineBasicBlock &MBB,
> + unsigned int Cycles)
> +{
> + // If this BB has a return, note how many cycles it takes to get there.
> + bool hasReturn = cyclesUntilReturn(MF, MBB, Cycles);
> + if (Cycles >= Threshold)
> + return;
> +
> + if (hasReturn) {
> + int BBNum = MBB.getNumber();
> + ReturnBBs[BBNum] = std::max(ReturnBBs[BBNum], Cycles);
> +
> + return;
> + }
> +
> + // Follow branches in BB and look for returns
> + for (MachineBasicBlock::succ_iterator I = MBB.succ_begin();
> + I != MBB.succ_end(); ++I) {
> + findReturn(MF, **I, Cycles);
> + }
> +}
> +
> +/// cyclesUntilReturn - if the MBB has a return instruction, set Location
> +/// to the instruction and return true. Return false otherwise.
> +/// Cycles will be incremented by the number of cycles taken to reach the
> +/// return or the end of the BB, whichever occurs first.
> +bool PadShortFunc::cyclesUntilReturn(MachineFunction &MF,
> + MachineBasicBlock &MBB,
> + unsigned int &Cycles,
> + MachineBasicBlock::iterator *Location)
> +{
> + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
> + const TargetMachine &Target = MF.getTarget();
> +
> + for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBB.end();
> + ++MBBI) {
> + MachineInstr *MI = MBBI;
> + // Mark basic blocks with a return instruction. Calls to other functions
> + // do not count because the called function will be padded, if necessary
> + if (MI->isReturn() && !MI->isCall()) {
> + if (Location)
> + *Location = MBBI;
> + return true;
> + }
> +
> + Cycles += TII.getInstrLatency(Target.getInstrItineraryData(), MI);
> + }
> +
> + return false;
> +}
> +
> +/// addPadding - Add the given number of NOOP instructions to the function
> +/// right before the return at MBBI
> +bool PadShortFunc::addPadding(MachineFunction &MF,
> + MachineBasicBlock &MBB,
> + MachineBasicBlock::iterator &MBBI,
> + unsigned int NOOPsToAdd)
> +{
> + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
> +
> + DebugLoc DL = MBBI->getDebugLoc();
> +
> + while (NOOPsToAdd-- > 0) {
> + // Since Atom has two instruction execution ports,
> +    // the code emits two noops, which will be executed in parallel
> + // during one cycle.
> + BuildMI(MBB, MBBI, DL, TII.get(X86::NOOP));
> + BuildMI(MBB, MBBI, DL, TII.get(X86::NOOP));
> + }
> +
> + return true;
> +}
> +
>
> Modified: llvm/trunk/lib/Target/X86/X86Subtarget.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.cpp?rev=171524&r1=171523&r2=171524&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86Subtarget.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp Fri Jan 4 14:54:54 2013
> @@ -350,6 +350,7 @@
> , UseLeaForSP(false)
> , HasSlowDivide(false)
> , PostRAScheduler(false)
> + , PadShortFunctions(false)
> , stackAlignment(4)
> // FIXME: this is a known good value for Yonah. How about others?
> , MaxInlineSizeThreshold(128)
>
> Modified: llvm/trunk/lib/Target/X86/X86Subtarget.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.h?rev=171524&r1=171523&r2=171524&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86Subtarget.h (original)
> +++ llvm/trunk/lib/Target/X86/X86Subtarget.h Fri Jan 4 14:54:54 2013
> @@ -146,6 +146,10 @@
> /// PostRAScheduler - True if using post-register-allocation scheduler.
> bool PostRAScheduler;
>
> +  /// PadShortFunctions - True if short functions should be padded to prevent
> + /// a stall when returning too early.
> + bool PadShortFunctions;
> +
> /// stackAlignment - The minimum alignment known to hold of the stack frame on
> /// entry to the function and which must be maintained by every function.
> unsigned stackAlignment;
> @@ -231,6 +235,7 @@
> bool hasCmpxchg16b() const { return HasCmpxchg16b; }
> bool useLeaForSP() const { return UseLeaForSP; }
> bool hasSlowDivide() const { return HasSlowDivide; }
> + bool padShortFunctions() const { return PadShortFunctions; }
>
> bool isAtom() const { return X86ProcFamily == IntelAtom; }
>
>
> Modified: llvm/trunk/lib/Target/X86/X86TargetMachine.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetMachine.cpp?rev=171524&r1=171523&r2=171524&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86TargetMachine.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86TargetMachine.cpp Fri Jan 4 14:54:54 2013
> @@ -190,6 +190,10 @@
> addPass(createX86IssueVZeroUpperPass());
> ShouldPrint = true;
> }
> + if (getX86Subtarget().padShortFunctions()){
> + addPass(createX86PadShortFunctions());
> + ShouldPrint = true;
> + }
>
> return ShouldPrint;
> }
>
> Added: llvm/trunk/test/CodeGen/X86/atom-pad-short-functions.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/atom-pad-short-functions.ll?rev=171524&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/atom-pad-short-functions.ll (added)
> +++ llvm/trunk/test/CodeGen/X86/atom-pad-short-functions.ll Fri Jan 4 14:54:54 2013
> @@ -0,0 +1,71 @@
> +; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
> +
> +declare void @external_function(...)
> +
> +define i32 @test_return_val(i32 %a) nounwind {
> +; CHECK: test_return_val
> +; CHECK: movl
> +; CHECK: nop
> +; CHECK: nop
> +; CHECK: nop
> +; CHECK: nop
> +; CHECK: nop
> +; CHECK: nop
> +; CHECK: ret
> + ret i32 %a
> +}
> +
> +define i32 @test_add(i32 %a, i32 %b) nounwind {
> +; CHECK: test_add
> +; CHECK: addl
> +; CHECK: nop
> +; CHECK: nop
> +; CHECK: nop
> +; CHECK: nop
> +; CHECK: ret
> + %result = add i32 %a, %b
> + ret i32 %result
> +}
> +
> +define i32 @test_multiple_ret(i32 %a, i32 %b, i1 %c) nounwind {
> +; CHECK: @test_multiple_ret
> +; CHECK: je
> +
> +; CHECK: nop
> +; CHECK: nop
> +; CHECK: ret
> +
> +; CHECK: nop
> +; CHECK: nop
> +; CHECK: ret
> +
> + br i1 %c, label %bb1, label %bb2
> +
> +bb1:
> + ret i32 %a
> +
> +bb2:
> + ret i32 %b
> +}
> +
> +define void @test_call_others(i32 %x) nounwind
> +{
> +; CHECK: test_call_others
> +; CHECK: je
> + %tobool = icmp eq i32 %x, 0
> + br i1 %tobool, label %if.end, label %true.case
> +
> +; CHECK: jmp external_function
> +true.case:
> + tail call void bitcast (void (...)* @external_function to void ()*)() nounwind
> + br label %if.end
> +
> +; CHECK: nop
> +; CHECK: nop
> +; CHECK: nop
> +; CHECK: nop
> +; CHECK: ret
> +if.end:
> + ret void
> +
> +}
>
> Modified: llvm/trunk/test/CodeGen/X86/fast-isel-x86-64.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fast-isel-x86-64.ll?rev=171524&r1=171523&r2=171524&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/fast-isel-x86-64.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/fast-isel-x86-64.ll Fri Jan 4 14:54:54 2013
> @@ -1,5 +1,5 @@
> -; RUN: llc < %s -mattr=-avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
> -; RUN: llc < %s -mattr=+avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX
> +; RUN: llc < %s -mattr=-avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
> +; RUN: llc < %s -mattr=+avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX
>
> target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
> target triple = "x86_64-apple-darwin10.0.0"
>
> Modified: llvm/trunk/test/CodeGen/X86/ret-mmx.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/ret-mmx.ll?rev=171524&r1=171523&r2=171524&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/ret-mmx.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/ret-mmx.ll Fri Jan 4 14:54:54 2013
> @@ -1,4 +1,4 @@
> -; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -mattr=+mmx,+sse2 | FileCheck %s
> +; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -mcpu=core2 -mattr=+mmx,+sse2 | FileCheck %s
> ; rdar://6602459
>
> @g_v1di = external global <1 x i64>
>
> Modified: llvm/trunk/test/CodeGen/X86/select.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/select.ll?rev=171524&r1=171523&r2=171524&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/select.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/select.ll Fri Jan 4 14:54:54 2013
> @@ -282,7 +282,7 @@
> ; ATOM: test13:
> ; ATOM: cmpl
> ; ATOM-NEXT: sbbl
> -; ATOM-NEXT: ret
> +; ATOM: ret
> }
>
> define i32 @test14(i32 %a, i32 %b) nounwind {
> @@ -299,7 +299,7 @@
> ; ATOM: cmpl
> ; ATOM-NEXT: sbbl
> ; ATOM-NEXT: notl
> -; ATOM-NEXT: ret
> +; ATOM: ret
> }
>
> ; rdar://10961709
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
More information about the llvm-commits
mailing list