[llvm] r367389 - [ARM][ParallelDSP] Convert to function pass

Mon Aug 19 09:06:13 PDT 2019

Just a heads up that we bisected a regression in Chromium (really
WebRTC) to this revision:
https://bugs.chromium.org/p/webrtc/issues/detail?id=10887

We still don't know what's wrong, and the problem could lie in the
WebRTC code, but I wanted to give a heads up that this affected our
code.

On Wed, Jul 31, 2019 at 9:31 AM Sam Parker via llvm-commits
<llvm-commits at lists.llvm.org> wrote:
>
> Author: sam_parker
> Date: Wed Jul 31 00:32:03 2019
> New Revision: 367389
>
> URL: http://llvm.org/viewvc/llvm-project?rev=367389&view=rev
> Log:
> [ARM][ParallelDSP] Convert to function pass
>
> Run across a whole function, visiting each basic block one at a time.
>
> Differential Revision: https://reviews.llvm.org/D65324
>
> Added:
>     llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll
> Modified:
>     llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
>     llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll
>     llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll
>
> Modified: llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp?rev=367389&r1=367388&r2=367389&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp (original)
> +++ llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp Wed Jul 31 00:32:03 2019
> @@ -1,4 +1,4 @@
> -//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
> +//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
>  //
>  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
>  // See https://llvm.org/LICENSE.txt for license information.
> @@ -18,13 +18,10 @@
>  #include "llvm/ADT/SmallPtrSet.h"
>  #include "llvm/Analysis/AliasAnalysis.h"
>  #include "llvm/Analysis/LoopAccessAnalysis.h"
> -#include "llvm/Analysis/LoopPass.h"
> -#include "llvm/Analysis/LoopInfo.h"
>  #include "llvm/IR/Instructions.h"
>  #include "llvm/IR/NoFolder.h"
>  #include "llvm/Transforms/Scalar.h"
>  #include "llvm/Transforms/Utils/BasicBlockUtils.h"
> -#include "llvm/Transforms/Utils/LoopUtils.h"
>  #include "llvm/Pass.h"
>  #include "llvm/PassRegistry.h"
>  #include "llvm/PassSupport.h"
> @@ -156,13 +153,11 @@ namespace {
>      }
>    };
>
> -  class ARMParallelDSP : public LoopPass {
> +  class ARMParallelDSP : public FunctionPass {
>      ScalarEvolution   *SE;
>      AliasAnalysis     *AA;
>      TargetLibraryInfo *TLI;
>      DominatorTree     *DT;
> -    LoopInfo          *LI;
> -    Loop              *L;
>      const DataLayout  *DL;
>      Module            *M;
>      std::map<LoadInst*, LoadInst*> LoadPairs;
> @@ -184,63 +179,38 @@ namespace {
>      /// products to a 32-bit accumulate operand. Optionally, the instruction can
>      /// exchange the halfwords of the second operand before performing the
>      /// arithmetic.
> -    bool MatchSMLAD(Loop *L);
> +    bool MatchSMLAD(Function &F);
>
>    public:
>      static char ID;
>
> -    ARMParallelDSP() : LoopPass(ID) { }
> -
> -    bool doInitialization(Loop *L, LPPassManager &LPM) override {
> -      LoadPairs.clear();
> -      WideLoads.clear();
> -      return true;
> -    }
> +    ARMParallelDSP() : FunctionPass(ID) { }
>
>      void getAnalysisUsage(AnalysisUsage &AU) const override {
> -      LoopPass::getAnalysisUsage(AU);
> +      FunctionPass::getAnalysisUsage(AU);
>        AU.addRequired<AssumptionCacheTracker>();
>        AU.addRequired<ScalarEvolutionWrapperPass>();
>        AU.addRequired<AAResultsWrapperPass>();
>        AU.addRequired<TargetLibraryInfoWrapperPass>();
> -      AU.addRequired<LoopInfoWrapperPass>();
>        AU.addRequired<DominatorTreeWrapperPass>();
>        AU.addRequired<TargetPassConfig>();
> -      AU.addPreserved<LoopInfoWrapperPass>();
> +      AU.addPreserved<ScalarEvolutionWrapperPass>();
> +      AU.addPreserved<GlobalsAAWrapperPass>();
>        AU.setPreservesCFG();
>      }
>
> -    bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
> +    bool runOnFunction(Function &F) override {
>        if (DisableParallelDSP)
>          return false;
> -      if (skipLoop(TheLoop))
> +      if (skipFunction(F))
>          return false;
>
> -      L = TheLoop;
>        SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
>        AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
>        TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
>        DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
> -      LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
>        auto &TPC = getAnalysis<TargetPassConfig>();
>
> -      BasicBlock *Header = TheLoop->getHeader();
> -      if (!Header)
> -        return false;
> -
> -      // TODO: We assume the loop header and latch to be the same block.
> -      // This is not a fundamental restriction, but lifting this would just
> -      // require more work to do the transformation and then patch up the CFG.
> -      if (Header != TheLoop->getLoopLatch()) {
> -        LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
> -                             "running pass ARMParallelDSP\n");
> -        return false;
> -      }
> -
> -      if (!TheLoop->getLoopPreheader())
> -        InsertPreheaderForLoop(L, DT, LI, nullptr, true);
> -
> -      Function &F = *Header->getParent();
>        M = F.getParent();
>        DL = &M->getDataLayout();
>
> @@ -265,17 +235,10 @@ namespace {
>          return false;
>        }
>
> -      LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
> -
>        LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
>        LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
>
> -      if (!RecordMemoryOps(Header)) {
> -        LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
> -        return false;
> -      }
> -
> -      bool Changes = MatchSMLAD(L);
> +      bool Changes = MatchSMLAD(F);
>        return Changes;
>      }
>    };
> @@ -337,6 +300,8 @@ bool ARMParallelDSP::IsNarrowSequence(Va
>  bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
>    SmallVector<LoadInst*, 8> Loads;
>    SmallVector<Instruction*, 8> Writes;
> +  LoadPairs.clear();
> +  WideLoads.clear();
>
>    // Collect loads and instruction that may write to memory. For now we only
>    // record loads which are simple, sign-extended and have a single user.
> @@ -415,7 +380,7 @@ bool ARMParallelDSP::RecordMemoryOps(Bas
>    return LoadPairs.size() > 1;
>  }
>
> -// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
> +// The pass needs to identify integer add/sub reductions of 16-bit vector
>  // multiplications.
>  // To use SMLAD:
>  // 1) we first need to find integer add then look for this pattern:
> @@ -446,13 +411,13 @@ bool ARMParallelDSP::RecordMemoryOps(Bas
>  // If loop invariants are used instead of loads, these need to be packed
>  // before the loop begins.
>  //
> -bool ARMParallelDSP::MatchSMLAD(Loop *L) {
> +bool ARMParallelDSP::MatchSMLAD(Function &F) {
>    // Search recursively back through the operands to find a tree of values that
>    // form a multiply-accumulate chain. The search records the Add and Mul
>    // instructions that form the reduction and allows us to find a single value
>    // to be used as the initial input to the accumulator.
> -  std::function<bool(Value*, Reduction&)> Search = [&]
> -    (Value *V, Reduction &R) -> bool {
> +  std::function<bool(Value*, BasicBlock*, Reduction&)> Search = [&]
> +    (Value *V, BasicBlock *BB, Reduction &R) -> bool {
>
>      // If we find a non-instruction, try to use it as the initial accumulator
>      // value. This may have already been found during the search in which case
> @@ -461,6 +426,9 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L)
>      if (!I)
>        return R.InsertAcc(V);
>
> +    if (I->getParent() != BB)
> +      return false;
> +
>      switch (I->getOpcode()) {
>      default:
>        break;
> @@ -471,8 +439,8 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L)
>        // Adds should be adding together two muls, or another add and a mul to
>        // be within the mac chain. One of the operands may also be the
>        // accumulator value at which point we should stop searching.
> -      bool ValidLHS = Search(I->getOperand(0), R);
> -      bool ValidRHS = Search(I->getOperand(1), R);
> +      bool ValidLHS = Search(I->getOperand(0), BB, R);
> +      bool ValidRHS = Search(I->getOperand(1), BB, R);
>        if (!ValidLHS && !ValidLHS)
>          return false;
>        else if (ValidLHS && ValidRHS) {
> @@ -498,36 +466,40 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L)
>        return false;
>      }
>      case Instruction::SExt:
> -      return Search(I->getOperand(0), R);
> +      return Search(I->getOperand(0), BB, R);
>      }
>      return false;
>    };
>
>    bool Changed = false;
> -  SmallPtrSet<Instruction*, 4> AllAdds;
> -  BasicBlock *Latch = L->getLoopLatch();
>
> -  for (Instruction &I : reverse(*Latch)) {
> -    if (I.getOpcode() != Instruction::Add)
> +  for (auto &BB : F) {
> +    SmallPtrSet<Instruction*, 4> AllAdds;
> +    if (!RecordMemoryOps(&BB))
>        continue;
>
> -    if (AllAdds.count(&I))
> -      continue;
> +    for (Instruction &I : reverse(BB)) {
> +      if (I.getOpcode() != Instruction::Add)
> +        continue;
>
> -    const auto *Ty = I.getType();
> -    if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
> -      continue;
> +      if (AllAdds.count(&I))
> +        continue;
>
> -    Reduction R(&I);
> -    if (!Search(&I, R))
> -      continue;
> +      const auto *Ty = I.getType();
> +      if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
> +        continue;
>
> -    if (!CreateParallelPairs(R))
> -      continue;
> +      Reduction R(&I);
> +      if (!Search(&I, &BB, R))
> +        continue;
>
> -    InsertParallelMACs(R);
> -    Changed = true;
> -    AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
> +      if (!CreateParallelPairs(R))
> +        continue;
> +
> +      InsertParallelMACs(R);
> +      Changed = true;
> +      AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
> +    }
>    }
>
>    return Changed;
> @@ -745,6 +717,6 @@ Pass *llvm::createARMParallelDSPPass() {
>  char ARMParallelDSP::ID = 0;
>
>  INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
> -                "Transform loops to use DSP intrinsics", false, false)
> +                "Transform functions to use DSP intrinsics", false, false)
>  INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
> -                "Transform loops to use DSP intrinsics", false, false)
> +                "Transform functions to use DSP intrinsics", false, false)
>
> Modified: llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll?rev=367389&r1=367388&r2=367389&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll (original)
> +++ llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll Wed Jul 31 00:32:03 2019
> @@ -37,8 +37,7 @@
>  ; CHECK-NEXT:      Scalar Evolution Analysis
>  ; CHECK-NEXT:      Basic Alias Analysis (stateless AA impl)
>  ; CHECK-NEXT:      Function Alias Analysis Results
> -; CHECK-NEXT:      Loop Pass Manager
> -; CHECK-NEXT:        Transform loops to use DSP intrinsics
> +; CHECK-NEXT:      Transform functions to use DSP intrinsics
>  ; CHECK-NEXT:      Interleaved Access Pass
>  ; CHECK-NEXT:      ARM IR optimizations
>  ; CHECK-NEXT:      Dominator Tree Construction
>
> Added: llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll?rev=367389&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll (added)
> +++ llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll Wed Jul 31 00:32:03 2019
> @@ -0,0 +1,79 @@
> +; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
> +
> +; CHECK-LABEL: single_block
> +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
> +; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
> +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
> +; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
> +; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc)
> +define i32 @single_block(i16* %a, i16* %b, i32 %acc) {
> +entry:
> +  %ld.a.0 = load i16, i16* %a
> +  %sext.a.0 = sext i16 %ld.a.0 to i32
> +  %ld.b.0 = load i16, i16* %b
> +  %sext.b.0 = sext i16 %ld.b.0 to i32
> +  %mul.0 = mul i32 %sext.a.0, %sext.b.0
> +  %addr.a.1 = getelementptr i16, i16* %a, i32 1
> +  %addr.b.1 = getelementptr i16, i16* %b, i32 1
> +  %ld.a.1 = load i16, i16* %addr.a.1
> +  %sext.a.1 = sext i16 %ld.a.1 to i32
> +  %ld.b.1 = load i16, i16* %addr.b.1
> +  %sext.b.1 = sext i16 %ld.b.1 to i32
> +  %mul.1 = mul i32 %sext.a.1, %sext.b.1
> +  %add = add i32 %mul.0, %mul.1
> +  %res = add i32 %add, %acc
> +  ret i32 %res
> +}
> +
> +; CHECK-LABEL: multi_block
> +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
> +; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
> +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
> +; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
> +; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0)
> +define i32 @multi_block(i16* %a, i16* %b, i32 %acc) {
> +entry:
> +  %ld.a.0 = load i16, i16* %a
> +  %sext.a.0 = sext i16 %ld.a.0 to i32
> +  %ld.b.0 = load i16, i16* %b
> +  %sext.b.0 = sext i16 %ld.b.0 to i32
> +  %mul.0 = mul i32 %sext.a.0, %sext.b.0
> +  %addr.a.1 = getelementptr i16, i16* %a, i32 1
> +  %addr.b.1 = getelementptr i16, i16* %b, i32 1
> +  %ld.a.1 = load i16, i16* %addr.a.1
> +  %sext.a.1 = sext i16 %ld.a.1 to i32
> +  %ld.b.1 = load i16, i16* %addr.b.1
> +  %sext.b.1 = sext i16 %ld.b.1 to i32
> +  %mul.1 = mul i32 %sext.a.1, %sext.b.1
> +  %add = add i32 %mul.0, %mul.1
> +  br label %bb.1
> +
> +bb.1:
> +  %res = add i32 %add, %acc
> +  ret i32 %res
> +}
> +
> +; CHECK-LABEL: multi_block_1
> +; CHECK-NOT: call i32 @llvm.arm.smlad
> +define i32 @multi_block_1(i16* %a, i16* %b, i32 %acc) {
> +entry:
> +  %ld.a.0 = load i16, i16* %a
> +  %sext.a.0 = sext i16 %ld.a.0 to i32
> +  %ld.b.0 = load i16, i16* %b
> +  %sext.b.0 = sext i16 %ld.b.0 to i32
> +  %mul.0 = mul i32 %sext.a.0, %sext.b.0
> +  br label %bb.1
> +
> +bb.1:
> +  %addr.a.1 = getelementptr i16, i16* %a, i32 1
> +  %addr.b.1 = getelementptr i16, i16* %b, i32 1
> +  %ld.a.1 = load i16, i16* %addr.a.1
> +  %sext.a.1 = sext i16 %ld.a.1 to i32
> +  %ld.b.1 = load i16, i16* %addr.b.1
> +  %sext.b.1 = sext i16 %ld.b.1 to i32
> +  %mul.1 = mul i32 %sext.a.1, %sext.b.1
> +  %add = add i32 %mul.0, %mul.1
> +  %res = add i32 %add, %acc
> +  ret i32 %res
> +}
> +
>
> Modified: llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll?rev=367389&r1=367388&r2=367389&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll (original)
> +++ llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll Wed Jul 31 00:32:03 2019
> @@ -2,7 +2,7 @@
>  ;
>  ; The loop header is not the loop latch.
>  ;
> -; CHECK-NOT:  call i32 @llvm.arm.smlad
> +; CHECK:  call i32 @llvm.arm.smlad
>  ;
>  define dso_local i32 @test(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
>  entry:
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits