[llvm] r367389 - [ARM][ParallelDSP] Convert to function pass

Thu Aug 22 01:09:55 PDT 2019

Thanks Hans, sorry I've been on holiday. I think it's triggering an underlying bug, so I'll look into a proper fix today.

Sam

Sam Parker

Compilation Tools Engineer | Arm

. . . . . . . . . . . . . . . . . . . . . . . . . . .

Arm.com

________________________________
From: Hans Wennborg <hans at chromium.org>
Sent: 21 August 2019 14:09
To: Sam Parker <Sam.Parker at arm.com>
Cc: llvm-commits <llvm-commits at lists.llvm.org>
Subject: Re: [llvm] r367389 - [ARM][ParallelDSP] Convert to function pass

It does look like a miscompile. Filed
https://bugs.llvm.org/show_bug.cgi?id=43073

On Mon, Aug 19, 2019 at 6:06 PM Hans Wennborg <hans at chromium.org> wrote:
>
> Just a heads up that we bisected a regression in Chromium (really
> WebRTC) to this revision:
> https://bugs.chromium.org/p/webrtc/issues/detail?id=10887
>
> We still don't know what's wrong, and the problem could lie in the
> WebRTC code, but I wanted to give a heads up that this affected our
> code.
>
> On Wed, Jul 31, 2019 at 9:31 AM Sam Parker via llvm-commits
> <llvm-commits at lists.llvm.org> wrote:
> >
> > Author: sam_parker
> > Date: Wed Jul 31 00:32:03 2019
> > New Revision: 367389
> >
> > URL: http://llvm.org/viewvc/llvm-project?rev=367389&view=rev
> > Log:
> > [ARM][ParallelDSP] Convert to function pass
> >
> > Run across a whole function, visiting each basic block one at a time.
> >
> > Differential Revision: https://reviews.llvm.org/D65324
> >
> > Added:
> >     llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll
> > Modified:
> >     llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
> >     llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll
> >     llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll
> >
> > Modified: llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
> > URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp?rev=367389&r1=367388&r2=367389&view=diff
> > ==============================================================================
> > --- llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp (original)
> > +++ llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp Wed Jul 31 00:32:03 2019
> > @@ -1,4 +1,4 @@
> > -//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
> > +//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
> >  //
> >  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
> >  // See https://llvm.org/LICENSE.txt for license information.
> > @@ -18,13 +18,10 @@
> >  #include "llvm/ADT/SmallPtrSet.h"
> >  #include "llvm/Analysis/AliasAnalysis.h"
> >  #include "llvm/Analysis/LoopAccessAnalysis.h"
> > -#include "llvm/Analysis/LoopPass.h"
> > -#include "llvm/Analysis/LoopInfo.h"
> >  #include "llvm/IR/Instructions.h"
> >  #include "llvm/IR/NoFolder.h"
> >  #include "llvm/Transforms/Scalar.h"
> >  #include "llvm/Transforms/Utils/BasicBlockUtils.h"
> > -#include "llvm/Transforms/Utils/LoopUtils.h"
> >  #include "llvm/Pass.h"
> >  #include "llvm/PassRegistry.h"
> >  #include "llvm/PassSupport.h"
> > @@ -156,13 +153,11 @@ namespace {
> >      }
> >    };
> >
> > -  class ARMParallelDSP : public LoopPass {
> > +  class ARMParallelDSP : public FunctionPass {
> >      ScalarEvolution   *SE;
> >      AliasAnalysis     *AA;
> >      TargetLibraryInfo *TLI;
> >      DominatorTree     *DT;
> > -    LoopInfo          *LI;
> > -    Loop              *L;
> >      const DataLayout  *DL;
> >      Module            *M;
> >      std::map<LoadInst*, LoadInst*> LoadPairs;
> > @@ -184,63 +179,38 @@ namespace {
> >      /// products to a 32-bit accumulate operand. Optionally, the instruction can
> >      /// exchange the halfwords of the second operand before performing the
> >      /// arithmetic.
> > -    bool MatchSMLAD(Loop *L);
> > +    bool MatchSMLAD(Function &F);
> >
> >    public:
> >      static char ID;
> >
> > -    ARMParallelDSP() : LoopPass(ID) { }
> > -
> > -    bool doInitialization(Loop *L, LPPassManager &LPM) override {
> > -      LoadPairs.clear();
> > -      WideLoads.clear();
> > -      return true;
> > -    }
> > +    ARMParallelDSP() : FunctionPass(ID) { }
> >
> >      void getAnalysisUsage(AnalysisUsage &AU) const override {
> > -      LoopPass::getAnalysisUsage(AU);
> > +      FunctionPass::getAnalysisUsage(AU);
> >        AU.addRequired<AssumptionCacheTracker>();
> >        AU.addRequired<ScalarEvolutionWrapperPass>();
> >        AU.addRequired<AAResultsWrapperPass>();
> >        AU.addRequired<TargetLibraryInfoWrapperPass>();
> > -      AU.addRequired<LoopInfoWrapperPass>();
> >        AU.addRequired<DominatorTreeWrapperPass>();
> >        AU.addRequired<TargetPassConfig>();
> > -      AU.addPreserved<LoopInfoWrapperPass>();
> > +      AU.addPreserved<ScalarEvolutionWrapperPass>();
> > +      AU.addPreserved<GlobalsAAWrapperPass>();
> >        AU.setPreservesCFG();
> >      }
> >
> > -    bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
> > +    bool runOnFunction(Function &F) override {
> >        if (DisableParallelDSP)
> >          return false;
> > -      if (skipLoop(TheLoop))
> > +      if (skipFunction(F))
> >          return false;
> >
> > -      L = TheLoop;
> >        SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
> >        AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
> >        TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
> >        DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
> > -      LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
> >        auto &TPC = getAnalysis<TargetPassConfig>();
> >
> > -      BasicBlock *Header = TheLoop->getHeader();
> > -      if (!Header)
> > -        return false;
> > -
> > -      // TODO: We assume the loop header and latch to be the same block.
> > -      // This is not a fundamental restriction, but lifting this would just
> > -      // require more work to do the transformation and then patch up the CFG.
> > -      if (Header != TheLoop->getLoopLatch()) {
> > -        LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
> > -                             "running pass ARMParallelDSP\n");
> > -        return false;
> > -      }
> > -
> > -      if (!TheLoop->getLoopPreheader())
> > -        InsertPreheaderForLoop(L, DT, LI, nullptr, true);
> > -
> > -      Function &F = *Header->getParent();
> >        M = F.getParent();
> >        DL = &M->getDataLayout();
> >
> > @@ -265,17 +235,10 @@ namespace {
> >          return false;
> >        }
> >
> > -      LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
> > -
> >        LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
> >        LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
> >
> > -      if (!RecordMemoryOps(Header)) {
> > -        LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
> > -        return false;
> > -      }
> > -
> > -      bool Changes = MatchSMLAD(L);
> > +      bool Changes = MatchSMLAD(F);
> >        return Changes;
> >      }
> >    };
> > @@ -337,6 +300,8 @@ bool ARMParallelDSP::IsNarrowSequence(Va
> >  bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
> >    SmallVector<LoadInst*, 8> Loads;
> >    SmallVector<Instruction*, 8> Writes;
> > +  LoadPairs.clear();
> > +  WideLoads.clear();
> >
> >    // Collect loads and instruction that may write to memory. For now we only
> >    // record loads which are simple, sign-extended and have a single user.
> > @@ -415,7 +380,7 @@ bool ARMParallelDSP::RecordMemoryOps(Bas
> >    return LoadPairs.size() > 1;
> >  }
> >
> > -// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
> > +// The pass needs to identify integer add/sub reductions of 16-bit vector
> >  // multiplications.
> >  // To use SMLAD:
> >  // 1) we first need to find integer add then look for this pattern:
> > @@ -446,13 +411,13 @@ bool ARMParallelDSP::RecordMemoryOps(Bas
> >  // If loop invariants are used instead of loads, these need to be packed
> >  // before the loop begins.
> >  //
> > -bool ARMParallelDSP::MatchSMLAD(Loop *L) {
> > +bool ARMParallelDSP::MatchSMLAD(Function &F) {
> >    // Search recursively back through the operands to find a tree of values that
> >    // form a multiply-accumulate chain. The search records the Add and Mul
> >    // instructions that form the reduction and allows us to find a single value
> >    // to be used as the initial input to the accumlator.
> > -  std::function<bool(Value*, Reduction&)> Search = [&]
> > -    (Value *V, Reduction &R) -> bool {
> > +  std::function<bool(Value*, BasicBlock*, Reduction&)> Search = [&]
> > +    (Value *V, BasicBlock *BB, Reduction &R) -> bool {
> >
> >      // If we find a non-instruction, try to use it as the initial accumulator
> >      // value. This may have already been found during the search in which case
> > @@ -461,6 +426,9 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L)
> >      if (!I)
> >        return R.InsertAcc(V);
> >
> > +    if (I->getParent() != BB)
> > +      return false;
> > +
> >      switch (I->getOpcode()) {
> >      default:
> >        break;
> > @@ -471,8 +439,8 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L)
> >        // Adds should be adding together two muls, or another add and a mul to
> >        // be within the mac chain. One of the operands may also be the
> >        // accumulator value at which point we should stop searching.
> > -      bool ValidLHS = Search(I->getOperand(0), R);
> > -      bool ValidRHS = Search(I->getOperand(1), R);
> > +      bool ValidLHS = Search(I->getOperand(0), BB, R);
> > +      bool ValidRHS = Search(I->getOperand(1), BB, R);
> >        if (!ValidLHS && !ValidLHS)
> >          return false;
> >        else if (ValidLHS && ValidRHS) {
> > @@ -498,36 +466,40 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L)
> >        return false;
> >      }
> >      case Instruction::SExt:
> > -      return Search(I->getOperand(0), R);
> > +      return Search(I->getOperand(0), BB, R);
> >      }
> >      return false;
> >    };
> >
> >    bool Changed = false;
> > -  SmallPtrSet<Instruction*, 4> AllAdds;
> > -  BasicBlock *Latch = L->getLoopLatch();
> >
> > -  for (Instruction &I : reverse(*Latch)) {
> > -    if (I.getOpcode() != Instruction::Add)
> > +  for (auto &BB : F) {
> > +    SmallPtrSet<Instruction*, 4> AllAdds;
> > +    if (!RecordMemoryOps(&BB))
> >        continue;
> >
> > -    if (AllAdds.count(&I))
> > -      continue;
> > +    for (Instruction &I : reverse(BB)) {
> > +      if (I.getOpcode() != Instruction::Add)
> > +        continue;
> >
> > -    const auto *Ty = I.getType();
> > -    if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
> > -      continue;
> > +      if (AllAdds.count(&I))
> > +        continue;
> >
> > -    Reduction R(&I);
> > -    if (!Search(&I, R))
> > -      continue;
> > +      const auto *Ty = I.getType();
> > +      if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
> > +        continue;
> >
> > -    if (!CreateParallelPairs(R))
> > -      continue;
> > +      Reduction R(&I);
> > +      if (!Search(&I, &BB, R))
> > +        continue;
> >
> > -    InsertParallelMACs(R);
> > -    Changed = true;
> > -    AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
> > +      if (!CreateParallelPairs(R))
> > +        continue;
> > +
> > +      InsertParallelMACs(R);
> > +      Changed = true;
> > +      AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
> > +    }
> >    }
> >
> >    return Changed;
> > @@ -745,6 +717,6 @@ Pass *llvm::createARMParallelDSPPass() {
> >  char ARMParallelDSP::ID = 0;
> >
> >  INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
> > -                "Transform loops to use DSP intrinsics", false, false)
> > +                "Transform functions to use DSP intrinsics", false, false)
> >  INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
> > -                "Transform loops to use DSP intrinsics", false, false)
> > +                "Transform functions to use DSP intrinsics", false, false)
> >
> > Modified: llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll
> > URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll?rev=367389&r1=367388&r2=367389&view=diff
> > ==============================================================================
> > --- llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll (original)
> > +++ llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll Wed Jul 31 00:32:03 2019
> > @@ -37,8 +37,7 @@
> >  ; CHECK-NEXT:      Scalar Evolution Analysis
> >  ; CHECK-NEXT:      Basic Alias Analysis (stateless AA impl)
> >  ; CHECK-NEXT:      Function Alias Analysis Results
> > -; CHECK-NEXT:      Loop Pass Manager
> > -; CHECK-NEXT:        Transform loops to use DSP intrinsics
> > +; CHECK-NEXT:      Transform functions to use DSP intrinsics
> >  ; CHECK-NEXT:      Interleaved Access Pass
> >  ; CHECK-NEXT:      ARM IR optimizations
> >  ; CHECK-NEXT:      Dominator Tree Construction
> >
> > Added: llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll
> > URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll?rev=367389&view=auto
> > ==============================================================================
> > --- llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll (added)
> > +++ llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll Wed Jul 31 00:32:03 2019
> > @@ -0,0 +1,79 @@
> > +; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
> > +
> > +; CHECK-LABEL: single_block
> > +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
> > +; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
> > +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
> > +; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
> > +; CHECK  call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc)
> > +define i32 @single_block(i16* %a, i16* %b, i32 %acc) {
> > +entry:
> > +  %ld.a.0 = load i16, i16* %a
> > +  %sext.a.0 = sext i16 %ld.a.0 to i32
> > +  %ld.b.0 = load i16, i16* %b
> > +  %sext.b.0 = sext i16 %ld.b.0 to i32
> > +  %mul.0 = mul i32 %sext.a.0, %sext.b.0
> > +  %addr.a.1 = getelementptr i16, i16* %a, i32 1
> > +  %addr.b.1 = getelementptr i16, i16* %b, i32 1
> > +  %ld.a.1 = load i16, i16* %addr.a.1
> > +  %sext.a.1 = sext i16 %ld.a.1 to i32
> > +  %ld.b.1 = load i16, i16* %addr.b.1
> > +  %sext.b.1 = sext i16 %ld.b.1 to i32
> > +  %mul.1 = mul i32 %sext.a.1, %sext.b.1
> > +  %add = add i32 %mul.0, %mul.1
> > +  %res = add i32 %add, %acc
> > +  ret i32 %res
> > +}
> > +
> > +; CHECK-LABEL: multi_block
> > +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
> > +; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
> > +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
> > +; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
> > +; CHECK  call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0)
> > +define i32 @multi_block(i16* %a, i16* %b, i32 %acc) {
> > +entry:
> > +  %ld.a.0 = load i16, i16* %a
> > +  %sext.a.0 = sext i16 %ld.a.0 to i32
> > +  %ld.b.0 = load i16, i16* %b
> > +  %sext.b.0 = sext i16 %ld.b.0 to i32
> > +  %mul.0 = mul i32 %sext.a.0, %sext.b.0
> > +  %addr.a.1 = getelementptr i16, i16* %a, i32 1
> > +  %addr.b.1 = getelementptr i16, i16* %b, i32 1
> > +  %ld.a.1 = load i16, i16* %addr.a.1
> > +  %sext.a.1 = sext i16 %ld.a.1 to i32
> > +  %ld.b.1 = load i16, i16* %addr.b.1
> > +  %sext.b.1 = sext i16 %ld.b.1 to i32
> > +  %mul.1 = mul i32 %sext.a.1, %sext.b.1
> > +  %add = add i32 %mul.0, %mul.1
> > +  br label %bb.1
> > +
> > +bb.1:
> > +  %res = add i32 %add, %acc
> > +  ret i32 %res
> > +}
> > +
> > +; CHECK-LABEL: multi_block_1
> > +; CHECK-NOT: call i32 @llvm.arm.smlad
> > +define i32 @multi_block_1(i16* %a, i16* %b, i32 %acc) {
> > +entry:
> > +  %ld.a.0 = load i16, i16* %a
> > +  %sext.a.0 = sext i16 %ld.a.0 to i32
> > +  %ld.b.0 = load i16, i16* %b
> > +  %sext.b.0 = sext i16 %ld.b.0 to i32
> > +  %mul.0 = mul i32 %sext.a.0, %sext.b.0
> > +  br label %bb.1
> > +
> > +bb.1:
> > +  %addr.a.1 = getelementptr i16, i16* %a, i32 1
> > +  %addr.b.1 = getelementptr i16, i16* %b, i32 1
> > +  %ld.a.1 = load i16, i16* %addr.a.1
> > +  %sext.a.1 = sext i16 %ld.a.1 to i32
> > +  %ld.b.1 = load i16, i16* %addr.b.1
> > +  %sext.b.1 = sext i16 %ld.b.1 to i32
> > +  %mul.1 = mul i32 %sext.a.1, %sext.b.1
> > +  %add = add i32 %mul.0, %mul.1
> > +  %res = add i32 %add, %acc
> > +  ret i32 %res
> > +}
> > +
> >
> > Modified: llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll
> > URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll?rev=367389&r1=367388&r2=367389&view=diff
> > ==============================================================================
> > --- llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll (original)
> > +++ llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll Wed Jul 31 00:32:03 2019
> > @@ -2,7 +2,7 @@
> >  ;
> >  ; The loop header is not the loop latch.
> >  ;
> > -; CHECK-NOT:  call i32 @llvm.arm.smlad
> > +; CHECK:  call i32 @llvm.arm.smlad
> >  ;
> >  define dso_local i32 @test(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
> >  entry:
> >
> >
> > _______________________________________________
> > llvm-commits mailing list
> > llvm-commits at lists.llvm.org
> > https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20190822/f5ffea25/attachment.html>