[llvm-commits] [llvm] r168931 - in /llvm/trunk: include/llvm/Target/TargetTransformImpl.h include/llvm/TargetTransformInfo.h lib/Target/X86/X86ISelLowering.cpp lib/Target/X86/X86ISelLowering.h lib/Target/X86/X86TargetMachine.h lib/Transforms/Scalar/LoopIdiomRecognize.cpp test/Transforms/LoopIdiom/popcnt.ll
Galina Kistanova
gkistanova at gmail.com
Fri Dec 7 14:53:37 PST 2012
Hello,
The test llvm/trunk/test/Transforms/LoopIdiom/popcnt.ll
fails in builder clang-native-arm-cortex-a9
http://lab.llvm.org:8011/builders/clang-native-arm-cortex-a9/builds/3811/steps/check-all/logs/fail
Please have a look at it.
Thanks
Galina
On Thu, Nov 29, 2012 at 11:38 AM, Shuxin Yang <shuxin.llvm at gmail.com> wrote:
> Author: shuxin_yang
> Date: Thu Nov 29 13:38:54 2012
> New Revision: 168931
>
> URL: http://llvm.org/viewvc/llvm-project?rev=168931&view=rev
> Log:
> rdar://12100355 (part 1)
>
> This revision attempts to recognize the following population-count pattern:
>
> while(a) { c++; ... ; a &= a - 1; ... },
> where <c> and <a> could be used multiple times in the loop body.
>
> TODO: On x86-64 and ARM, __builtin_ctpop() is not expanded to an efficient
> instruction sequence, which needs to be improved in the following commits.
>
> Reviewed by Nadav, really appreciate!
>
> Added:
> llvm/trunk/test/Transforms/LoopIdiom/popcnt.ll
> Modified:
> llvm/trunk/include/llvm/Target/TargetTransformImpl.h
> llvm/trunk/include/llvm/TargetTransformInfo.h
> llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> llvm/trunk/lib/Target/X86/X86ISelLowering.h
> llvm/trunk/lib/Target/X86/X86TargetMachine.h
> llvm/trunk/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
>
> Modified: llvm/trunk/include/llvm/Target/TargetTransformImpl.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetTransformImpl.h?rev=168931&r1=168930&r2=168931&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm/Target/TargetTransformImpl.h (original)
> +++ llvm/trunk/include/llvm/Target/TargetTransformImpl.h Thu Nov 29 13:38:54 2012
> @@ -26,7 +26,7 @@
> /// ScalarTargetTransformInfo interface. Different targets can implement
> /// this interface differently.
> class ScalarTargetTransformImpl : public ScalarTargetTransformInfo {
> -private:
> +protected:
> const TargetLowering *TLI;
>
> public:
>
> Modified: llvm/trunk/include/llvm/TargetTransformInfo.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/TargetTransformInfo.h?rev=168931&r1=168930&r2=168931&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm/TargetTransformInfo.h (original)
> +++ llvm/trunk/include/llvm/TargetTransformInfo.h Thu Nov 29 13:38:54 2012
> @@ -75,6 +75,18 @@
> /// LSR, and LowerInvoke use this interface.
> class ScalarTargetTransformInfo {
> public:
> + /// PopcntHwSupport - Hardware support for population count. Compared to the
> + /// SW implementation, HW support is supposed to significantly boost the
> + /// performance when the population is dense, and it may or may not degrade
> + /// performance if the population is sparse. HW support is considered
> + /// "Fast" if it can outperform, or is on a par with, the SW implementation when
> + /// the population is sparse; otherwise, it is considered "Slow".
> + enum PopcntHwSupport {
> + None,
> + Fast,
> + Slow
> + };
> +
> virtual ~ScalarTargetTransformInfo() {}
>
> /// isLegalAddImmediate - Return true if the specified immediate is legal
> @@ -122,6 +134,11 @@
> virtual bool shouldBuildLookupTables() const {
> return true;
> }
> +
> + /// getPopcntHwSupport - Return hardware support for population count.
> + virtual PopcntHwSupport getPopcntHwSupport(unsigned IntTyWidthInBit) const {
> + return None;
> + }
> };
>
> /// VectorTargetTransformInfo - This interface is used by the vectorizers
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=168931&r1=168930&r2=168931&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Nov 29 13:38:54 2012
> @@ -17670,6 +17670,17 @@
> return -1;
> }
>
> +ScalarTargetTransformInfo::PopcntHwSupport
> +X86ScalarTargetTransformImpl::getPopcntHwSupport(unsigned TyWidth) const {
> + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
> + const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
> +
> + // TODO: Currently the __builtin_popcount() implementation using SSE3
> + // instructions is inefficient. Once the problem is fixed, we should
> + // call ST.hasSSE3() instead of ST.hasSSE41().
> + return ST.hasSSE41() ? Fast : None;
> +}
> +
> unsigned
> X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
> Type *Ty) const {
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=168931&r1=168930&r2=168931&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Thu Nov 29 13:38:54 2012
> @@ -933,6 +933,14 @@
> const TargetLibraryInfo *libInfo);
> }
>
> + class X86ScalarTargetTransformImpl : public ScalarTargetTransformImpl {
> + public:
> + explicit X86ScalarTargetTransformImpl(const TargetLowering *TL) :
> + ScalarTargetTransformImpl(TL) {};
> +
> + virtual PopcntHwSupport getPopcntHwSupport(unsigned TyWidth) const;
> + };
> +
> class X86VectorTargetTransformInfo : public VectorTargetTransformImpl {
> public:
> explicit X86VectorTargetTransformInfo(const TargetLowering *TL) :
>
> Modified: llvm/trunk/lib/Target/X86/X86TargetMachine.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetMachine.h?rev=168931&r1=168930&r2=168931&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86TargetMachine.h (original)
> +++ llvm/trunk/lib/Target/X86/X86TargetMachine.h Thu Nov 29 13:38:54 2012
> @@ -118,7 +118,7 @@
> X86SelectionDAGInfo TSInfo;
> X86TargetLowering TLInfo;
> X86JITInfo JITInfo;
> - ScalarTargetTransformImpl STTI;
> + X86ScalarTargetTransformImpl STTI;
> X86VectorTargetTransformInfo VTTI;
> public:
> X86_64TargetMachine(const Target &T, StringRef TT,
>
> Modified: llvm/trunk/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/LoopIdiomRecognize.cpp?rev=168931&r1=168930&r2=168931&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Scalar/LoopIdiomRecognize.cpp (original)
> +++ llvm/trunk/lib/Transforms/Scalar/LoopIdiomRecognize.cpp Thu Nov 29 13:38:54 2012
> @@ -56,6 +56,7 @@
> #include "llvm/Support/raw_ostream.h"
> #include "llvm/DataLayout.h"
> #include "llvm/Target/TargetLibraryInfo.h"
> +#include "llvm/TargetTransformInfo.h"
> #include "llvm/Transforms/Utils/Local.h"
> using namespace llvm;
>
> @@ -63,16 +64,83 @@
> STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
>
> namespace {
> +
> + class LoopIdiomRecognize;
> +
> + /// This class defines some utility functions for loop idiom recognition.
> + class LIRUtil {
> + public:
> + /// Return true iff the block contains nothing but an unconditional branch
> + /// (aka goto instruction).
> + static bool isAlmostEmpty(BasicBlock *);
> +
> + static BranchInst *getBranch(BasicBlock *BB) {
> + return dyn_cast<BranchInst>(BB->getTerminator());
> + }
> +
> + /// Return the condition of the branch terminating the given basic block.
> + static Value *getBrCondtion(BasicBlock *);
> +
> + /// Derive the precondition block (i.e the block that guards the loop
> + /// preheader) from the given preheader.
> + static BasicBlock *getPrecondBb(BasicBlock *PreHead);
> + };
> +
> + /// This class is to recognize idioms of population-count conducted in
> + /// a noncountable loop. Currently it only recognizes this pattern:
> + /// \code
> + /// while(x) {cnt++; ...; x &= x - 1; ...}
> + /// \endcode
> + class NclPopcountRecognize {
> + LoopIdiomRecognize &LIR;
> + Loop *CurLoop;
> + BasicBlock *PreCondBB;
> +
> + typedef IRBuilder<> IRBuilderTy;
> +
> + public:
> + explicit NclPopcountRecognize(LoopIdiomRecognize &TheLIR);
> + bool recognize();
> +
> + private:
> + /// Take a glimpse of the loop to see if we need to go ahead recognizing
> + /// the idiom.
> + bool preliminaryScreen();
> +
> + /// Check if the given conditional branch is based on the comparison
> + /// between a variable and zero, and if the variable is non-zero, the
> + /// control yields to the loop entry. If the branch matches the behavior,
> + /// the variable involved in the comparison is returned. This function will
> + /// be called to see if the precondition and postcondition of the loop
> + /// are in desirable form.
> + Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const;
> +
> + /// Return true iff the idiom is detected in the loop. If so, 1) \p CntInst
> + /// is set to the instruction counting the population bit. 2) \p CntPhi
> + /// is set to the corresponding phi node. 3) \p Var is set to the value
> + /// whose population bits are being counted.
> + bool detectIdiom
> + (Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const;
> +
> + /// Insert ctpop intrinsic function and some obviously dead instructions.
> + void transform (Instruction *CntInst, PHINode *CntPhi, Value *Var);
> +
> + /// Create llvm.ctpop.* intrinsic function.
> + CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL);
> + };
> +
> class LoopIdiomRecognize : public LoopPass {
> Loop *CurLoop;
> const DataLayout *TD;
> DominatorTree *DT;
> ScalarEvolution *SE;
> TargetLibraryInfo *TLI;
> + const ScalarTargetTransformInfo *STTI;
> public:
> static char ID;
> explicit LoopIdiomRecognize() : LoopPass(ID) {
> initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
> + TD = 0; DT = 0; SE = 0; TLI = 0; STTI = 0;
> }
>
> bool runOnLoop(Loop *L, LPPassManager &LPM);
> @@ -110,6 +178,36 @@
> AU.addRequired<DominatorTree>();
> AU.addRequired<TargetLibraryInfo>();
> }
> +
> + const DataLayout *getDataLayout() {
> + return TD ? TD : TD=getAnalysisIfAvailable<DataLayout>();
> + }
> +
> + DominatorTree *getDominatorTree() {
> + return DT ? DT : (DT=&getAnalysis<DominatorTree>());
> + }
> +
> + ScalarEvolution *getScalarEvolution() {
> + return SE ? SE : (SE = &getAnalysis<ScalarEvolution>());
> + }
> +
> + TargetLibraryInfo *getTargetLibraryInfo() {
> + return TLI ? TLI : (TLI = &getAnalysis<TargetLibraryInfo>());
> + }
> +
> + const ScalarTargetTransformInfo *getScalarTargetTransformInfo() {
> + if (!STTI) {
> + TargetTransformInfo *TTI = getAnalysisIfAvailable<TargetTransformInfo>();
> + if (TTI) STTI = TTI->getScalarTargetTransformInfo();
> + }
> + return STTI;
> + }
> +
> + Loop *getLoop() const { return CurLoop; }
> +
> + private:
> + bool runOnNoncountableLoop();
> + bool runOnCountableLoop();
> };
> }
>
> @@ -172,24 +270,390 @@
> deleteDeadInstruction(I, SE, TLI);
> }
>
> -bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
> - CurLoop = L;
> +//===----------------------------------------------------------------------===//
> +//
> +// Implementation of LIRUtil
> +//
> +//===----------------------------------------------------------------------===//
> +
> +// This function will return true iff the given block contains nothing but goto.
> +// A typical usage of this function is to check if the preheader block is
> +// "almost" empty such that the generated intrinsic function can be moved across
> +// the preheader and placed at the end of the precondition block without
> +// the concern of breaking data dependence.
> +bool LIRUtil::isAlmostEmpty(BasicBlock *BB) {
> + if (BranchInst *Br = getBranch(BB)) {
> + return Br->isUnconditional() && BB->size() == 1;
> + }
> + return false;
> +}
>
> - // If the loop could not be converted to canonical form, it must have an
> - // indirectbr in it, just give up.
> - if (!L->getLoopPreheader())
> +Value *LIRUtil::getBrCondtion(BasicBlock *BB) {
> + BranchInst *Br = getBranch(BB);
> + return Br ? Br->getCondition() : 0;
> +}
> +
> +BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) {
> + if (BasicBlock *BB = PreHead->getSinglePredecessor()) {
> + BranchInst *Br = getBranch(BB);
> + return Br && Br->isConditional() ? BB : 0;
> + }
> + return 0;
> +}
> +
> +//===----------------------------------------------------------------------===//
> +//
> +// Implementation of NclPopcountRecognize
> +//
> +//===----------------------------------------------------------------------===//
> +
> +NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR):
> + LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(0) {
> +}
> +
> +bool NclPopcountRecognize::preliminaryScreen() {
> + const ScalarTargetTransformInfo *STTI = LIR.getScalarTargetTransformInfo();
> + if (STTI->getPopcntHwSupport(32) != ScalarTargetTransformInfo::Fast)
> return false;
>
> - // Disable loop idiom recognition if the function's name is a common idiom.
> - StringRef Name = L->getHeader()->getParent()->getName();
> - if (Name == "memset" || Name == "memcpy")
> + // Counting population is usually conducted by a few arithmetic instructions.
> + // Such instructions can be easily "absorbed" by vacant slots in a
> + // non-compact loop. Therefore, recognizing popcount idiom only makes sense
> + // in a compact loop.
> +
> + // Give up if the loop has multiple blocks or multiple backedges.
> + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
> return false;
>
> - // The trip count of the loop must be analyzable.
> - SE = &getAnalysis<ScalarEvolution>();
> - if (!SE->hasLoopInvariantBackedgeTakenCount(L))
> + BasicBlock *LoopBody = *(CurLoop->block_begin());
> + if (LoopBody->size() >= 20) {
> + // The loop is too big, bail out.
> + return false;
> + }
> +
> + // It should have a preheader containing nothing but a goto instruction.
> + BasicBlock *PreHead = CurLoop->getLoopPreheader();
> + if (!PreHead || !LIRUtil::isAlmostEmpty(PreHead))
> + return false;
> +
> + // It should have a precondition block where the generated popcount intrinsic
> + // function will be inserted.
> + PreCondBB = LIRUtil::getPrecondBb(PreHead);
> + if (!PreCondBB)
> + return false;
> +
> + return true;
> +}
> +
> +Value *NclPopcountRecognize::matchCondition (BranchInst *Br,
> + BasicBlock *LoopEntry) const {
> + if (!Br || !Br->isConditional())
> + return 0;
> +
> + ICmpInst *Cond = dyn_cast<ICmpInst>(Br->getCondition());
> + if (!Cond)
> + return 0;
> +
> + ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
> + if (!CmpZero || !CmpZero->isZero())
> + return 0;
> +
> + ICmpInst::Predicate Pred = Cond->getPredicate();
> + if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) ||
> + (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry))
> + return Cond->getOperand(0);
> +
> + return 0;
> +}
> +
> +bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst,
> + PHINode *&CntPhi,
> + Value *&Var) const {
> + // Following code tries to detect this idiom:
> + //
> + // if (x0 == 0)
> + // goto loop-exit // the precondition of the loop
> + // cnt0 = init-val;
> + // do {
> + // x1 = phi (x0, x2);
> + // cnt1 = phi(cnt0, cnt2);
> + //
> + // cnt2 = cnt1 + 1;
> + // ...
> + // x2 = x1 & (x1 - 1);
> + // ...
> + // } while(x2 != 0);
> + //
> + // loop-exit:
> + //
> +
> + // step 1: Check to see if the loop-back branch matches this pattern:
> + // "if (a!=0) goto loop-entry".
> + BasicBlock *LoopEntry;
> + Instruction *DefX2, *CountInst;
> + Value *VarX1, *VarX0;
> + PHINode *PhiX, *CountPhi;
> +
> + DefX2 = CountInst = 0;
> + VarX1 = VarX0 = 0;
> + PhiX = CountPhi = 0;
> + LoopEntry = *(CurLoop->block_begin());
> +
> + // step 1: Check if the loop-back branch is in desirable form.
> + {
> + if (Value *T = matchCondition (LIRUtil::getBranch(LoopEntry), LoopEntry))
> + DefX2 = dyn_cast<Instruction>(T);
> + else
> + return false;
> + }
> +
> + // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
> + {
> + if (DefX2->getOpcode() != Instruction::And)
> + return false;
> +
> + BinaryOperator *SubOneOp;
> +
> + if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
> + VarX1 = DefX2->getOperand(1);
> + else {
> + VarX1 = DefX2->getOperand(0);
> + SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
> + }
> + if (!SubOneOp)
> + return false;
> +
> + Instruction *SubInst = cast<Instruction>(SubOneOp);
> + ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
> + if (!Dec ||
> + !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
> + (SubInst->getOpcode() == Instruction::Add && Dec->isAllOnesValue()))) {
> + return false;
> + }
> + }
> +
> + // step 3: Check the recurrence of variable X
> + {
> + PhiX = dyn_cast<PHINode>(VarX1);
> + if (!PhiX ||
> + (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) {
> + return false;
> + }
> + }
> +
> + // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
> + {
> + CountInst = NULL;
> + for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(),
> + IterE = LoopEntry->end(); Iter != IterE; Iter++) {
> + Instruction *Inst = Iter;
> + if (Inst->getOpcode() != Instruction::Add)
> + continue;
> +
> + ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
> + if (!Inc || !Inc->isOne())
> + continue;
> +
> + PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
> + if (!Phi && Phi->getParent() != LoopEntry)
> + continue;
> +
> + // Check if the result of the instruction is live out of the loop.
> + bool LiveOutLoop = false;
> + for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
> + I != E; I++) {
> + if ((cast<Instruction>(*I))->getParent() != LoopEntry) {
> + LiveOutLoop = true; break;
> + }
> + }
> +
> + if (LiveOutLoop) {
> + CountInst = Inst;
> + CountPhi = Phi;
> + break;
> + }
> + }
> +
> + if (!CountInst)
> + return false;
> + }
> +
> + // step 5: check if the precondition is in this form:
> + // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
> + {
> + BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB);
> + Value *T = matchCondition (PreCondBr, CurLoop->getLoopPreheader());
> + if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
> + return false;
> +
> + CntInst = CountInst;
> + CntPhi = CountPhi;
> + Var = T;
> + }
> +
> + return true;
> +}
> +
> +void NclPopcountRecognize::transform(Instruction *CntInst,
> + PHINode *CntPhi, Value *Var) {
> +
> + ScalarEvolution *SE = LIR.getScalarEvolution();
> + TargetLibraryInfo *TLI = LIR.getTargetLibraryInfo();
> + BasicBlock *PreHead = CurLoop->getLoopPreheader();
> + BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB);
> + const DebugLoc DL = CntInst->getDebugLoc();
> +
> + // Assuming before transformation, the loop is following:
> + // if (x) // the precondition
> + // do { cnt++; x &= x - 1; } while(x);
> +
> + // Step 1: Insert the ctpop instruction at the end of the precondition block
> + IRBuilderTy Builder(PreCondBr);
> + Value *PopCnt, *PopCntZext, *NewCount;
> + {
> + PopCnt = createPopcntIntrinsic(Builder, Var, DL);
> + NewCount = PopCntZext =
> + Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
> +
> + if (NewCount != PopCnt)
> + (cast<Instruction>(NewCount))->setDebugLoc(DL);
> +
> + // If the population counter's initial value is not zero, insert Add Inst.
> + Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
> + ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
> + if (!InitConst || !InitConst->isZero()) {
> + NewCount = Builder.CreateAdd(PopCnt, InitConst);
> + (cast<Instruction>(NewCount))->setDebugLoc(DL);
> + }
> + }
> +
> + // Step 2: Replace the precondition from "if(x == 0) goto loop-exit" to
> + // "if(NewCount == 0) goto loop-exit". Without this change, the intrinsic
> + // function would be partially dead code, and downstream passes will drag
> + // it back from the precondition block to the preheader.
> + {
> + ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
> +
> + Value *Opnd0 = PopCntZext;
> + Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
> + if (PreCond->getOperand(0) != Var)
> + std::swap(Opnd0, Opnd1);
> +
> + ICmpInst *NewPreCond =
> + cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
> + PreCond->replaceAllUsesWith(NewPreCond);
> +
> + deleteDeadInstruction(PreCond, *SE, TLI);
> + }
> +
> + // Step 3: Note that the population count is exactly the trip count of the
> + // loop in question, which enables us to convert the loop from a noncountable
> + // loop into a countable one. The benefit is twofold:
> + //
> + // - If the loop only counts population, the entire loop becomes dead after
> + // the transformation. It is a lot easier to prove a countable loop dead
> + // than to prove a noncountable one. (In some C dialects, an infinite loop
> + // isn't dead even if it computes nothing useful. In general, DCE needs
> + // to prove a noncountable loop finite before safely deleting it.)
> + //
> + // - If the loop also performs something else, it remains alive.
> + // Since it is transformed to countable form, it can be aggressively
> + // optimized by some optimizations which are in general not applicable
> + // to a noncountable loop.
> + //
> + // After this step, this loop (conceptually) would look like following:
> + // newcnt = __builtin_ctpop(x);
> + // t = newcnt;
> + // if (x)
> + // do { cnt++; x &= x-1; t--; } while (t > 0);
> + BasicBlock *Body = *(CurLoop->block_begin());
> + {
> + BranchInst *LbBr = LIRUtil::getBranch(Body);
> + ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
> + Type *Ty = NewCount->getType();
> +
> + PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", Body->begin());
> +
> + Builder.SetInsertPoint(LbCond);
> + Value *Opnd1 = cast<Value>(TcPhi);
> + Value *Opnd2 = cast<Value>(ConstantInt::get(Ty, 1));
> + Instruction *TcDec =
> + cast<Instruction>(Builder.CreateSub(Opnd1, Opnd2, "tcdec", false, true));
> +
> + TcPhi->addIncoming(NewCount, PreHead);
> + TcPhi->addIncoming(TcDec, Body);
> +
> + CmpInst::Predicate Pred = (LbBr->getSuccessor(0) == Body) ?
> + CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
> + LbCond->setPredicate(Pred);
> + LbCond->setOperand(0, TcDec);
> + LbCond->setOperand(1, cast<Value>(ConstantInt::get(Ty, 0)));
> + }
> +
> + // Step 4: All the references to the original population counter outside
> + // the loop are replaced with the NewCount -- the value returned from
> + // __builtin_ctpop().
> + {
> + SmallVector<Value *, 4> CntUses;
> + for (Value::use_iterator I = CntInst->use_begin(), E = CntInst->use_end();
> + I != E; I++) {
> + if (cast<Instruction>(*I)->getParent() != Body)
> + CntUses.push_back(*I);
> + }
> + for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) {
> + (cast<Instruction>(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount);
> + }
> + }
> +
> + // step 5: Forget the "non-computable" trip-count SCEV associated with the
> + // loop. The loop would otherwise not be deleted even if it becomes empty.
> + SE->forgetLoop(CurLoop);
> +}
> +
> +CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder,
> + Value *Val, DebugLoc DL) {
> + Value *Ops[] = { Val };
> + Type *Tys[] = { Val->getType() };
> +
> + Module *M = (*(CurLoop->block_begin()))->getParent()->getParent();
> + Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
> + CallInst *CI = IRBuilder.CreateCall(Func, Ops);
> + CI->setDebugLoc(DL);
> +
> + return CI;
> +}
> +
> +/// recognize - detect population count idiom in a non-countable loop. If
> +/// detected, transform the relevant code to popcount intrinsic function
> +/// call, and return true; otherwise, return false.
> +bool NclPopcountRecognize::recognize() {
> +
> + if (!LIR.getScalarTargetTransformInfo())
> + return false;
> +
> + LIR.getScalarEvolution();
> +
> + if (!preliminaryScreen())
> + return false;
> +
> + Instruction *CntInst;
> + PHINode *CntPhi;
> + Value *Val;
> + if (!detectIdiom(CntInst, CntPhi, Val))
> return false;
> - const SCEV *BECount = SE->getBackedgeTakenCount(L);
> +
> + transform(CntInst, CntPhi, Val);
> + return true;
> +}
> +
> +//===----------------------------------------------------------------------===//
> +//
> +// Implementation of LoopIdiomRecognize
> +//
> +//===----------------------------------------------------------------------===//
> +
> +bool LoopIdiomRecognize::runOnCountableLoop() {
> + const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
> if (isa<SCEVCouldNotCompute>(BECount)) return false;
>
> // If this loop executes exactly one time, then it should be peeled, not
> @@ -199,24 +663,27 @@
> return false;
>
> // We require target data for now.
> - TD = getAnalysisIfAvailable<DataLayout>();
> - if (TD == 0) return false;
> + if (!getDataLayout())
> + return false;
> +
> + getDominatorTree();
>
> - DT = &getAnalysis<DominatorTree>();
> LoopInfo &LI = getAnalysis<LoopInfo>();
> TLI = &getAnalysis<TargetLibraryInfo>();
>
> + getTargetLibraryInfo();
> +
> SmallVector<BasicBlock*, 8> ExitBlocks;
> CurLoop->getUniqueExitBlocks(ExitBlocks);
>
> DEBUG(dbgs() << "loop-idiom Scanning: F["
> - << L->getHeader()->getParent()->getName()
> - << "] Loop %" << L->getHeader()->getName() << "\n");
> + << CurLoop->getHeader()->getParent()->getName()
> + << "] Loop %" << CurLoop->getHeader()->getName() << "\n");
>
> bool MadeChange = false;
> // Scan all the blocks in the loop that are not in subloops.
> - for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
> - ++BI) {
> + for (Loop::block_iterator BI = CurLoop->block_begin(),
> + E = CurLoop->block_end(); BI != E; ++BI) {
> // Ignore blocks in subloops.
> if (LI.getLoopFor(*BI) != CurLoop)
> continue;
> @@ -226,6 +693,33 @@
> return MadeChange;
> }
>
> +bool LoopIdiomRecognize::runOnNoncountableLoop() {
> + NclPopcountRecognize Popcount(*this);
> + if (Popcount.recognize())
> + return true;
> +
> + return false;
> +}
> +
> +bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
> + CurLoop = L;
> +
> + // If the loop could not be converted to canonical form, it must have an
> + // indirectbr in it, just give up.
> + if (!L->getLoopPreheader())
> + return false;
> +
> + // Disable loop idiom recognition if the function's name is a common idiom.
> + StringRef Name = L->getHeader()->getParent()->getName();
> + if (Name == "memset" || Name == "memcpy")
> + return false;
> +
> + SE = &getAnalysis<ScalarEvolution>();
> + if (SE->hasLoopInvariantBackedgeTakenCount(L))
> + return runOnCountableLoop();
> + return runOnNoncountableLoop();
> +}
> +
> /// runOnLoopBlock - Process the specified block, which lives in a counted loop
> /// with the specified backedge count. This block is known to be in the current
> /// loop and not in any subloops.
>
> Added: llvm/trunk/test/Transforms/LoopIdiom/popcnt.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/popcnt.ll?rev=168931&view=auto
> ==============================================================================
> --- llvm/trunk/test/Transforms/LoopIdiom/popcnt.ll (added)
> +++ llvm/trunk/test/Transforms/LoopIdiom/popcnt.ll Thu Nov 29 13:38:54 2012
> @@ -0,0 +1,76 @@
> +; RUN: opt -loop-idiom < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -S | FileCheck %s
> +
> +;To recognize this pattern:
> +;int popcount(unsigned long long a) {
> +; int c = 0;
> +; while (a) {
> +; c++;
> +; a &= a - 1;
> +; }
> +; return c;
> +;}
> +;
> +; CHECK: entry
> +; CHECK: llvm.ctpop.i64
> +; CHECK: ret
> +define i32 @popcount(i64 %a) nounwind uwtable readnone ssp {
> +entry:
> + %tobool3 = icmp eq i64 %a, 0
> + br i1 %tobool3, label %while.end, label %while.body
> +
> +while.body: ; preds = %entry, %while.body
> + %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
> + %a.addr.04 = phi i64 [ %and, %while.body ], [ %a, %entry ]
> + %inc = add nsw i32 %c.05, 1
> + %sub = add i64 %a.addr.04, -1
> + %and = and i64 %sub, %a.addr.04
> + %tobool = icmp eq i64 %and, 0
> + br i1 %tobool, label %while.end, label %while.body
> +
> +while.end: ; preds = %while.body, %entry
> + %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
> + ret i32 %c.0.lcssa
> +}
> +
> +; To recognize this pattern:
> +;int popcount(unsigned long long a, int mydata1, int mydata2) {
> +; int c = 0;
> +; while (a) {
> +; c++;
> +; a &= a - 1;
> +; mydata1 *= c;
> +; mydata2 *= (int)a;
> +; }
> +; return c + mydata1 + mydata2;
> +;}
> +; CHECK: entry
> +; CHECK: llvm.ctpop.i64
> +; CHECK: ret
> +define i32 @popcount2(i64 %a, i32 %mydata1, i32 %mydata2) nounwind uwtable readnone ssp {
> +entry:
> + %tobool9 = icmp eq i64 %a, 0
> + br i1 %tobool9, label %while.end, label %while.body
> +
> +while.body: ; preds = %entry, %while.body
> + %c.013 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
> + %mydata2.addr.012 = phi i32 [ %mul1, %while.body ], [ %mydata2, %entry ]
> + %mydata1.addr.011 = phi i32 [ %mul, %while.body ], [ %mydata1, %entry ]
> + %a.addr.010 = phi i64 [ %and, %while.body ], [ %a, %entry ]
> + %inc = add nsw i32 %c.013, 1
> + %sub = add i64 %a.addr.010, -1
> + %and = and i64 %sub, %a.addr.010
> + %mul = mul nsw i32 %inc, %mydata1.addr.011
> + %conv = trunc i64 %and to i32
> + %mul1 = mul nsw i32 %conv, %mydata2.addr.012
> + %tobool = icmp eq i64 %and, 0
> + br i1 %tobool, label %while.end, label %while.body
> +
> +while.end: ; preds = %while.body, %entry
> + %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
> + %mydata2.addr.0.lcssa = phi i32 [ %mydata2, %entry ], [ %mul1, %while.body ]
> + %mydata1.addr.0.lcssa = phi i32 [ %mydata1, %entry ], [ %mul, %while.body ]
> + %add = add i32 %mydata2.addr.0.lcssa, %mydata1.addr.0.lcssa
> + %add2 = add i32 %add, %c.0.lcssa
> + ret i32 %add2
> +}
> +
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
More information about the llvm-commits
mailing list