[llvm] r337701 - [ARM][NFC] ParallelDSP reorganisation

Mon Jul 23 10:25:45 PDT 2018

This revision introduced a memory leak:
http://lab.llvm.org:8011/builders/sanitizer-x86_64-linux-fast/builds/21273/steps/check-llvm%20asan/logs/stdio.
Please take a look.

==11220==ERROR: LeakSanitizer: detected memory leaks

Direct leak of 1024 byte(s) in 2 object(s) allocated from:
    #0 0xa4d088 in operator new(unsigned long)
/b/sanitizer-x86_64-linux-fast/build/llvm/projects/compiler-rt/lib/asan/asan_new_delete.cc:92
    #1 0x19068f7 in AddMACCandidate(llvm::SmallVector<(anonymous
namespace)::OpChain*, 8u>&, llvm::Instruction const*, llvm::Value*,
llvm::Value*, int)
/b/sanitizer-x86_64-linux-fast/build/llvm/lib/Target/ARM/ARMParallelDSP.cpp:458:26
    #2 0x19000da in MatchParallelMACSequences
/b/sanitizer-x86_64-linux-fast/build/llvm/lib/Target/ARM/ARMParallelDSP.cpp:471:5
    #3 0x19000da in (anonymous
namespace)::ARMParallelDSP::MatchSMLAD(llvm::Function&)
/b/sanitizer-x86_64-linux-fast/build/llvm/lib/Target/ARM/ARMParallelDSP.cpp:602
    #4 0x18fddb7 in (anonymous
namespace)::ARMParallelDSP::runOnLoop(llvm::Loop*,
llvm::LPPassManager&)
/b/sanitizer-x86_64-linux-fast/build/llvm/lib/Target/ARM/ARMParallelDSP.cpp:195:17
    #5 0x3687f59 in
llvm::LPPassManager::runOnFunction(llvm::Function&)
/b/sanitizer-x86_64-linux-fast/build/llvm/lib/Analysis/LoopPass.cpp:217:23
    #6 0x47f755d in
llvm::FPPassManager::runOnFunction(llvm::Function&)
/b/sanitizer-x86_64-linux-fast/build/llvm/lib/IR/LegacyPassManager.cpp:1586:27
    #7 0x47f7b72 in llvm::FPPassManager::runOnModule(llvm::Module&)
/b/sanitizer-x86_64-linux-fast/build/llvm/lib/IR/LegacyPassManager.cpp:1609:16
    #8 0x47f88e8 in runOnModule
/b/sanitizer-x86_64-linux-fast/build/llvm/lib/IR/LegacyPassManager.cpp:1669:27
    #9 0x47f88e8 in llvm::legacy::PassManagerImpl::run(llvm::Module&)
/b/sanitizer-x86_64-linux-fast/build/llvm/lib/IR/LegacyPassManager.cpp:1774
    #10 0xa9286e in main
/b/sanitizer-x86_64-linux-fast/build/llvm/tools/opt/opt.cpp:810:12
    #11 0x7f53d46c62e0 in __libc_start_main
(/lib/x86_64-linux-gnu/libc.so.6+0x202e0)

SUMMARY: AddressSanitizer: 1024 byte(s) leaked in 2 allocation(s).

On Mon, Jul 23, 2018 at 8:26 AM Sam Parker via llvm-commits <
llvm-commits at lists.llvm.org> wrote:

> Author: sam_parker
> Date: Mon Jul 23 08:25:59 2018
> New Revision: 337701
>
> URL: http://llvm.org/viewvc/llvm-project?rev=337701&view=rev
> Log:
> [ARM][NFC] ParallelDSP reorganisation
>
> In preparing to allow ARMParallelDSP pass to parallelise more than
> smlads, I've restructed some elements:
>
> - The ParallelMAC struct has been renamed to BinOpChain.
> - The BinOpChain struct holds two value lists: LHS and RHS, as well
>   as inheriting from the OpChain base class.
> - The OpChain struct holds all the values of the represented chain
>   and has had the memory locations functionality inserted into it.
> - ParallelMACList becomes OpChainList and it now holds pointers
>   instead of objects.
>
> Differential Revision: https://reviews.llvm.org/D49020
>
> Modified:
>     llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
>
> Modified: llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp?rev=337701&r1=337700&r2=337701&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp (original)
> +++ llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp Mon Jul 23 08:25:59 2018
> @@ -43,38 +43,56 @@ using namespace PatternMatch;
>  STATISTIC(NumSMLAD , "Number of smlad instructions generated");
>
>  namespace {
> -  struct ParallelMAC;
> +  struct OpChain;
> +  struct BinOpChain;
>    struct Reduction;
>
> -  using ParallelMACList = SmallVector<ParallelMAC, 8>;
> +  using OpChainList     = SmallVector<OpChain*, 8>;
>    using ReductionList   = SmallVector<Reduction, 8>;
>    using ValueList       = SmallVector<Value*, 8>;
>    using MemInstList     = SmallVector<Instruction*, 8>;
> -  using PMACPair        = std::pair<ParallelMAC*,ParallelMAC*>;
> +  using PMACPair        = std::pair<BinOpChain*,BinOpChain*>;
>    using PMACPairList    = SmallVector<PMACPair, 8>;
>    using Instructions    = SmallVector<Instruction*,16>;
>    using MemLocList      = SmallVector<MemoryLocation, 4>;
>
> -  // 'ParallelMAC' and 'Reduction' are just some bookkeeping data
> structures.
> +  struct OpChain {
> +    Instruction   *Root;
> +    ValueList     AllValues;
> +    MemInstList   VecLd;    // List of all load instructions.
> +    MemLocList    MemLocs;  // All memory locations read by this tree.
> +    bool          ReadOnly = true;
> +
> +    OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
> +
> +    void SetMemoryLocations() {
> +      const auto Size = MemoryLocation::UnknownSize;
> +      for (auto *V : AllValues) {
> +        if (auto *I = dyn_cast<Instruction>(V)) {
> +          if (I->mayWriteToMemory())
> +            ReadOnly = false;
> +          if (auto *Ld = dyn_cast<LoadInst>(V))
> +            MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(),
> Size));
> +        }
> +      }
> +    }
> +
> +    unsigned size() const { return AllValues.size(); }
> +  };
> +
> +  // 'BinOpChain' and 'Reduction' are just some bookkeeping data
> structures.
>    // 'Reduction' contains the phi-node and accumulator statement from
> where we
> -  // start pattern matching, and 'ParallelMAC' the multiplication
> +  // start pattern matching, and 'BinOpChain' the multiplication
>    // instructions that are candidates for parallel execution.
> -  struct ParallelMAC {
> -    Instruction *Mul;
> -    ValueList    VL;        // List of all (narrow) operands of this Mul
> -    MemInstList  VecLd;     // List of all load instructions of this Mul
> -    MemLocList   MemLocs;   // All memory locations read by this Mul
> -
> -    // The MAC-chains we currently recognise are simple chains that
> accumulate
> -    // their results with a reducing integer add statement, and consist of
> -    // a chain of adds and muls, which have only sext and load
> instructions as
> -    // operands. Thus, these chains don't write memory. We check that
> this is
> -    // true when we collect the operands, and use this in alias analysis
> checks
> -    // that different parallel MACs don't interfere with each other.
> -    bool ReadOnly;
> -
> -    ParallelMAC(Instruction *I, ValueList &V, bool RdOnly)
> -      : Mul(I), VL(V), ReadOnly(RdOnly) {};
> +  struct BinOpChain : public OpChain {
> +    ValueList     LHS;      // List of all (narrow) left hand operands.
> +    ValueList     RHS;      // List of all (narrow) right hand operands.
> +
> +    BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) :
> +      OpChain(I, lhs), LHS(lhs), RHS(rhs) {
> +        for (auto *V : RHS)
> +          AllValues.push_back(V);
> +      }
>    };
>
>    struct Reduction {
> @@ -83,7 +101,7 @@ namespace {
>      Instruction     *AccIntAdd;       // The accumulating integer add
> statement,
>                                        // i.e, the reduction statement.
>
> -    ParallelMACList MACCandidates;    // The MAC candidates associated
> with
> +    OpChainList     MACCandidates;    // The MAC candidates associated
> with
>                                        // this reduction statement.
>      Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
>    };
> @@ -100,7 +118,7 @@ namespace {
>
>      bool InsertParallelMACs(Reduction &Reduction, PMACPairList
> &PMACPairs);
>      bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList
> &VecMem);
> -    PMACPairList CreateParallelMACPairs(ParallelMACList &Candidates);
> +    PMACPairList CreateParallelMACPairs(OpChainList &Candidates);
>      Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
>                                   Instruction *Acc, Instruction
> *InsertAfter);
>
> @@ -303,7 +321,7 @@ bool ARMParallelDSP::AreSequentialLoads(
>  }
>
>  PMACPairList
> -ARMParallelDSP::CreateParallelMACPairs(ParallelMACList &Candidates) {
> +ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) {
>    const unsigned Elems = Candidates.size();
>    PMACPairList PMACPairs;
>
> @@ -314,10 +332,10 @@ ARMParallelDSP::CreateParallelMACPairs(P
>    // We can compare all elements, but then we need to compare and evaluate
>    // different solutions.
>    for(unsigned i=0; i<Elems-1; i+=2) {
> -    ParallelMAC &PMul0 = Candidates[i];
> -    ParallelMAC &PMul1 = Candidates[i+1];
> -    const Instruction *Mul0 = PMul0.Mul;
> -    const Instruction *Mul1 = PMul1.Mul;
> +    BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i]);
> +    BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[i+1]);
> +    const Instruction *Mul0 = PMul0->Root;
> +    const Instruction *Mul1 = PMul1->Root;
>
>      if (Mul0 == Mul1)
>        continue;
> @@ -326,10 +344,13 @@ ARMParallelDSP::CreateParallelMACPairs(P
>                 dbgs() << "- "; Mul0->dump();
>                 dbgs() << "- "; Mul1->dump());
>
> -    const ValueList &VL0 = PMul0.VL;
> -    const ValueList &VL1 = PMul1.VL;
> +    const ValueList &Mul0_LHS = PMul0->LHS;
> +    const ValueList &Mul0_RHS = PMul0->RHS;
> +    const ValueList &Mul1_LHS = PMul1->LHS;
> +    const ValueList &Mul1_RHS = PMul1->RHS;
>
> -    if (!AreSymmetrical(VL0, VL1))
> +    if (!AreSymmetrical(Mul0_LHS, Mul1_LHS) ||
> +        !AreSymmetrical(Mul0_RHS, Mul1_RHS))
>        continue;
>
>      LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
> @@ -337,23 +358,23 @@ ARMParallelDSP::CreateParallelMACPairs(P
>      // that its two pairs of consecutive loads, then these can be
> transformed
>      // into two wider loads and the users can be replaced with DSP
>      // intrinsics.
> -    for (unsigned x = 0; x < VL0.size(); x += 4) {
> -      auto *Ld0 = dyn_cast<LoadInst>(VL0[x]);
> -      auto *Ld1 = dyn_cast<LoadInst>(VL1[x]);
> -      auto *Ld2 = dyn_cast<LoadInst>(VL0[x+2]);
> -      auto *Ld3 = dyn_cast<LoadInst>(VL1[x+2]);
> +    for (unsigned x = 0; x < Mul0_LHS.size(); x += 2) {
> +      auto *Ld0 = dyn_cast<LoadInst>(Mul0_LHS[x]);
> +      auto *Ld1 = dyn_cast<LoadInst>(Mul1_LHS[x]);
> +      auto *Ld2 = dyn_cast<LoadInst>(Mul0_RHS[x]);
> +      auto *Ld3 = dyn_cast<LoadInst>(Mul1_RHS[x]);
>
>        LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n";
> -                 dbgs() << "\t mul1: "; VL0[x]->dump();
> -                 dbgs() << "\t mul2: "; VL1[x]->dump();
> +                 dbgs() << "\t mul1: "; Mul0_LHS[x]->dump();
> +                 dbgs() << "\t mul2: "; Mul1_LHS[x]->dump();
>                   dbgs() << "and operands " << x + 2 << ":\n";
> -                 dbgs() << "\t mul1: "; VL0[x+2]->dump();
> -                 dbgs() << "\t mul2: "; VL1[x+2]->dump());
> +                 dbgs() << "\t mul1: "; Mul0_RHS[x]->dump();
> +                 dbgs() << "\t mul2: "; Mul1_RHS[x]->dump());
>
> -      if (AreSequentialLoads(Ld0, Ld1, Candidates[i].VecLd) &&
> -          AreSequentialLoads(Ld2, Ld3, Candidates[i+1].VecLd)) {
> +      if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd) &&
> +          AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
>          LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
> -        PMACPairs.push_back(std::make_pair(&PMul0, &PMul1));
> +        PMACPairs.push_back(std::make_pair(PMul0, PMul1));
>        }
>      }
>    }
> @@ -367,8 +388,8 @@ bool ARMParallelDSP::InsertParallelMACs(
>
>    for (auto &Pair : PMACPairs) {
>      LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
> -               dbgs() << "- "; Pair.first->Mul->dump();
> -               dbgs() << "- "; Pair.second->Mul->dump());
> +               dbgs() << "- "; Pair.first->Root->dump();
> +               dbgs() << "- "; Pair.second->Root->dump());
>      auto *VecLd0 = cast<LoadInst>(Pair.first->VecLd[0]);
>      auto *VecLd1 = cast<LoadInst>(Pair.second->VecLd[0]);
>      Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, InsertAfter);
> @@ -383,9 +404,8 @@ bool ARMParallelDSP::InsertParallelMACs(
>    return false;
>  }
>
> -static ReductionList MatchReductions(Function &F, Loop *TheLoop,
> -                                     BasicBlock *Header) {
> -  ReductionList Reductions;
> +static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock
> *Header,
> +                            ReductionList &Reductions) {
>    RecurrenceDescriptor RecDesc;
>    const bool HasFnNoNaNAttr =
>      F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
> @@ -394,7 +414,7 @@ static ReductionList MatchReductions(Fun
>    // We need a preheader as getIncomingValueForBlock assumes there is one.
>    if (!TheLoop->getLoopPreheader()) {
>      LLVM_DEBUG(dbgs() << "No preheader found, bailing out\n");
> -    return Reductions;
> +    return;
>    }
>
>    for (PHINode &Phi : Header->phis()) {
> @@ -418,36 +438,29 @@ static ReductionList MatchReductions(Fun
>
>    LLVM_DEBUG(
>      dbgs() << "\nAccumulating integer additions (reductions) found:\n";
> -    for (auto R : Reductions) {
> +    for (auto &R : Reductions) {
>        dbgs() << "-  "; R.Phi->dump();
>        dbgs() << "-> "; R.AccIntAdd->dump();
>      }
>    );
> -  return Reductions;
>  }
>
> -static void AddMACCandidate(ParallelMACList &Candidates, const
> Instruction *Acc,
> +static void AddMACCandidate(OpChainList &Candidates,
> +                            const Instruction *Acc,
>                              Value *MulOp0, Value *MulOp1, int MulOpNum) {
>    Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
>    LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
> -  ValueList VL;
> -  if (IsNarrowSequence<16>(MulOp0, VL) &&
> -      IsNarrowSequence<16>(MulOp1, VL)) {
> +  ValueList LHS;
> +  ValueList RHS;
> +  if (IsNarrowSequence<16>(MulOp0, LHS) &&
> +      IsNarrowSequence<16>(MulOp1, RHS)) {
>      LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
> -
> -    bool MayWriteMem = false;
> -    for (auto &V : VL) {
> -      if (dyn_cast<Instruction>(V)->mayWriteToMemory()) {
> -        MayWriteMem = true;
> -        break;
> -      }
> -    }
> -    Candidates.push_back(ParallelMAC(Mul, VL, !MayWriteMem));
> +    Candidates.push_back(new BinOpChain(Mul, LHS, RHS));
>    }
>  }
>
> -static ParallelMACList MatchParallelMACs(Reduction &R) {
> -  ParallelMACList Candidates;
> +static void MatchParallelMACSequences(Reduction &R,
> +                                      OpChainList &Candidates) {
>    const Instruction *Acc = R.AccIntAdd;
>    Value *A, *MulOp0, *MulOp1;
>    LLVM_DEBUG(dbgs() << "\n- Analysing:\t"; Acc->dump());
> @@ -473,7 +486,6 @@ static ParallelMACList MatchParallelMACs
>    // Because we start at the bottom of the chain, and we work our way up,
>    // the muls are added in reverse program order to the list.
>    std::reverse(Candidates.begin(), Candidates.end());
> -  return Candidates;
>  }
>
>  // Collects all instructions that are not part of the MAC chains, which
> is the
> @@ -492,23 +504,23 @@ static void AliasCandidates(BasicBlock *
>  // the memory locations accessed by the MAC-chains.
>  // TODO: we need the read statements when we accept more complicated
> chains.
>  static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
> -                       Instructions &Writes, ParallelMACList
> &MACCandidates) {
> +                       Instructions &Writes, OpChainList &MACCandidates) {
>    LLVM_DEBUG(dbgs() << "Alias checks:\n");
> -  for (auto &MAC : MACCandidates) {
> -    LLVM_DEBUG(dbgs() << "mul: "; MAC.Mul->dump());
> +  for (auto *MAC : MACCandidates) {
> +    LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
>
>      // At the moment, we allow only simple chains that only consist of
> reads,
>      // accumulate their result with an integer add, and thus that don't
> write
>      // memory, and simply bail if they do.
> -    if (!MAC.ReadOnly)
> +    if (!MAC->ReadOnly)
>        return true;
>
>      // Now for all writes in the basic block, check that they don't alias
> with
>      // the memory locations accessed by our MAC-chain:
>      for (auto *I : Writes) {
>        LLVM_DEBUG(dbgs() << "- "; I->dump());
> -      assert(MAC.MemLocs.size() >= 2 && "expecting at least 2 memlocs");
> -      for (auto &MemLoc : MAC.MemLocs) {
> +      assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
> +      for (auto &MemLoc : MAC->MemLocs) {
>          if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
>                                            ModRefInfo::ModRef))) {
>            LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
> @@ -522,24 +534,22 @@ static bool AreAliased(AliasAnalysis *AA
>    return false;
>  }
>
> -static bool SetMemoryLocations(ParallelMACList &Candidates) {
> -  const auto Size = MemoryLocation::UnknownSize;
> -  for (auto &C : Candidates) {
> +static bool CheckMACMemory(OpChainList &Candidates) {
> +  for (auto *C : Candidates) {
>      // A mul has 2 operands, and a narrow op consist of sext and a load;
> thus
>      // we expect at least 4 items in this operand value list.
> -    if (C.VL.size() < 4) {
> +    if (C->size() < 4) {
>        LLVM_DEBUG(dbgs() << "Operand list too short.\n");
>        return false;
>      }
> -
> -    for (unsigned i = 0; i < C.VL.size(); i += 4) {
> -      auto *LdOp0 = dyn_cast<LoadInst>(C.VL[i]);
> -      auto *LdOp1 = dyn_cast<LoadInst>(C.VL[i+2]);
> -      if (!LdOp0 || !LdOp1)
> +    C->SetMemoryLocations();
> +    ValueList &LHS = static_cast<BinOpChain*>(C)->LHS;
> +    ValueList &RHS = static_cast<BinOpChain*>(C)->RHS;
> +
> +    // Use +=2 to skip over the expected extend instructions.
> +    for (unsigned i = 0, e = LHS.size(); i < e; i += 2) {
> +      if (!isa<LoadInst>(LHS[i]) || !isa<LoadInst>(RHS[i]))
>          return false;
> -
> -      C.MemLocs.push_back(MemoryLocation(LdOp0->getPointerOperand(),
> Size));
> -      C.MemLocs.push_back(MemoryLocation(LdOp1->getPointerOperand(),
> Size));
>      }
>    }
>    return true;
> @@ -584,17 +594,20 @@ bool ARMParallelDSP::MatchSMLAD(Function
>               dbgs() << "Loop info:\n\n"; L->dump());
>
>    bool Changed = false;
> -  ReductionList Reductions = MatchReductions(F, L, Header);
> +  ReductionList Reductions;
> +  MatchReductions(F, L, Header, Reductions);
>
>    for (auto &R : Reductions) {
> -    ParallelMACList MACCandidates = MatchParallelMACs(R);
> -    if (!SetMemoryLocations(MACCandidates))
> +    OpChainList MACCandidates;
> +    MatchParallelMACSequences(R, MACCandidates);
> +    if (!CheckMACMemory(MACCandidates))
>        continue;
> +
>      R.MACCandidates = MACCandidates;
>
>      LLVM_DEBUG(dbgs() << "MAC candidates:\n";
>        for (auto &M : R.MACCandidates)
> -        M.Mul->dump();
> +        M->Root->dump();
>        dbgs() << "\n";);
>    }
>
> @@ -609,6 +622,8 @@ bool ARMParallelDSP::MatchSMLAD(Function
>        return false;
>      PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates);
>      Changed |= InsertParallelMACs(R, PMACPairs);
> +    for (auto *C : R.MACCandidates)
> +      delete C;
>    }
>
>    LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20180723/8f156914/attachment-0001.html>