[llvm] r367389 - [ARM][ParallelDSP] Convert to function pass
Sam Parker via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 22 01:09:55 PDT 2019
Thanks Hans, sorry I've been on holiday. I think it's triggering an underlying bug, so I'll look into a proper fix today.
Sam
Sam Parker
Compilation Tools Engineer | Arm
. . . . . . . . . . . . . . . . . . . . . . . . . . .
Arm.com
________________________________
From: Hans Wennborg <hans at chromium.org>
Sent: 21 August 2019 14:09
To: Sam Parker <Sam.Parker at arm.com>
Cc: llvm-commits <llvm-commits at lists.llvm.org>
Subject: Re: [llvm] r367389 - [ARM][ParallelDSP] Convert to function pass
It does look like a miscompile. Filed
https://bugs.llvm.org/show_bug.cgi?id=43073
On Mon, Aug 19, 2019 at 6:06 PM Hans Wennborg <hans at chromium.org> wrote:
>
> Just a heads up that we bisected a regression in Chromium (really
> WebRTC) to this revision:
> https://bugs.chromium.org/p/webrtc/issues/detail?id=10887
>
> We still don't know what's wrong, and the problem could lie in the
> WebRTC code, but I wanted to give a heads up that this affected our
> code.
>
> On Wed, Jul 31, 2019 at 9:31 AM Sam Parker via llvm-commits
> <llvm-commits at lists.llvm.org> wrote:
> >
> > Author: sam_parker
> > Date: Wed Jul 31 00:32:03 2019
> > New Revision: 367389
> >
> > URL: http://llvm.org/viewvc/llvm-project?rev=367389&view=rev
> > Log:
> > [ARM][ParallelDSP] Convert to function pass
> >
> > Run across a whole function, visiting each basic block one at a time.
> >
> > Differential Revision: https://reviews.llvm.org/D65324
> >
> > Added:
> > llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll
> > Modified:
> > llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
> > llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll
> > llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll
> >
> > Modified: llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
> > URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp?rev=367389&r1=367388&r2=367389&view=diff
> > ==============================================================================
> > --- llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp (original)
> > +++ llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp Wed Jul 31 00:32:03 2019
> > @@ -1,4 +1,4 @@
> > -//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
> > +//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
> > //
> > // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
> > // See https://llvm.org/LICENSE.txt for license information.
> > @@ -18,13 +18,10 @@
> > #include "llvm/ADT/SmallPtrSet.h"
> > #include "llvm/Analysis/AliasAnalysis.h"
> > #include "llvm/Analysis/LoopAccessAnalysis.h"
> > -#include "llvm/Analysis/LoopPass.h"
> > -#include "llvm/Analysis/LoopInfo.h"
> > #include "llvm/IR/Instructions.h"
> > #include "llvm/IR/NoFolder.h"
> > #include "llvm/Transforms/Scalar.h"
> > #include "llvm/Transforms/Utils/BasicBlockUtils.h"
> > -#include "llvm/Transforms/Utils/LoopUtils.h"
> > #include "llvm/Pass.h"
> > #include "llvm/PassRegistry.h"
> > #include "llvm/PassSupport.h"
> > @@ -156,13 +153,11 @@ namespace {
> > }
> > };
> >
> > - class ARMParallelDSP : public LoopPass {
> > + class ARMParallelDSP : public FunctionPass {
> > ScalarEvolution *SE;
> > AliasAnalysis *AA;
> > TargetLibraryInfo *TLI;
> > DominatorTree *DT;
> > - LoopInfo *LI;
> > - Loop *L;
> > const DataLayout *DL;
> > Module *M;
> > std::map<LoadInst*, LoadInst*> LoadPairs;
> > @@ -184,63 +179,38 @@ namespace {
> > /// products to a 32-bit accumulate operand. Optionally, the instruction can
> > /// exchange the halfwords of the second operand before performing the
> > /// arithmetic.
> > - bool MatchSMLAD(Loop *L);
> > + bool MatchSMLAD(Function &F);
> >
> > public:
> > static char ID;
> >
> > - ARMParallelDSP() : LoopPass(ID) { }
> > -
> > - bool doInitialization(Loop *L, LPPassManager &LPM) override {
> > - LoadPairs.clear();
> > - WideLoads.clear();
> > - return true;
> > - }
> > + ARMParallelDSP() : FunctionPass(ID) { }
> >
> > void getAnalysisUsage(AnalysisUsage &AU) const override {
> > - LoopPass::getAnalysisUsage(AU);
> > + FunctionPass::getAnalysisUsage(AU);
> > AU.addRequired<AssumptionCacheTracker>();
> > AU.addRequired<ScalarEvolutionWrapperPass>();
> > AU.addRequired<AAResultsWrapperPass>();
> > AU.addRequired<TargetLibraryInfoWrapperPass>();
> > - AU.addRequired<LoopInfoWrapperPass>();
> > AU.addRequired<DominatorTreeWrapperPass>();
> > AU.addRequired<TargetPassConfig>();
> > - AU.addPreserved<LoopInfoWrapperPass>();
> > + AU.addPreserved<ScalarEvolutionWrapperPass>();
> > + AU.addPreserved<GlobalsAAWrapperPass>();
> > AU.setPreservesCFG();
> > }
> >
> > - bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
> > + bool runOnFunction(Function &F) override {
> > if (DisableParallelDSP)
> > return false;
> > - if (skipLoop(TheLoop))
> > + if (skipFunction(F))
> > return false;
> >
> > - L = TheLoop;
> > SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
> > AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
> > TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
> > DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
> > - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
> > auto &TPC = getAnalysis<TargetPassConfig>();
> >
> > - BasicBlock *Header = TheLoop->getHeader();
> > - if (!Header)
> > - return false;
> > -
> > - // TODO: We assume the loop header and latch to be the same block.
> > - // This is not a fundamental restriction, but lifting this would just
> > - // require more work to do the transformation and then patch up the CFG.
> > - if (Header != TheLoop->getLoopLatch()) {
> > - LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
> > - "running pass ARMParallelDSP\n");
> > - return false;
> > - }
> > -
> > - if (!TheLoop->getLoopPreheader())
> > - InsertPreheaderForLoop(L, DT, LI, nullptr, true);
> > -
> > - Function &F = *Header->getParent();
> > M = F.getParent();
> > DL = &M->getDataLayout();
> >
> > @@ -265,17 +235,10 @@ namespace {
> > return false;
> > }
> >
> > - LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
> > -
> > LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
> > LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
> >
> > - if (!RecordMemoryOps(Header)) {
> > - LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
> > - return false;
> > - }
> > -
> > - bool Changes = MatchSMLAD(L);
> > + bool Changes = MatchSMLAD(F);
> > return Changes;
> > }
> > };
> > @@ -337,6 +300,8 @@ bool ARMParallelDSP::IsNarrowSequence(Va
> > bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
> > SmallVector<LoadInst*, 8> Loads;
> > SmallVector<Instruction*, 8> Writes;
> > + LoadPairs.clear();
> > + WideLoads.clear();
> >
> > // Collect loads and instruction that may write to memory. For now we only
> > // record loads which are simple, sign-extended and have a single user.
> > @@ -415,7 +380,7 @@ bool ARMParallelDSP::RecordMemoryOps(Bas
> > return LoadPairs.size() > 1;
> > }
> >
> > -// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
> > +// The pass needs to identify integer add/sub reductions of 16-bit vector
> > // multiplications.
> > // To use SMLAD:
> > // 1) we first need to find integer add then look for this pattern:
> > @@ -446,13 +411,13 @@ bool ARMParallelDSP::RecordMemoryOps(Bas
> > // If loop invariants are used instead of loads, these need to be packed
> > // before the loop begins.
> > //
> > -bool ARMParallelDSP::MatchSMLAD(Loop *L) {
> > +bool ARMParallelDSP::MatchSMLAD(Function &F) {
> > // Search recursively back through the operands to find a tree of values that
> > // form a multiply-accumulate chain. The search records the Add and Mul
> > // instructions that form the reduction and allows us to find a single value
> > // to be used as the initial input to the accumlator.
> > - std::function<bool(Value*, Reduction&)> Search = [&]
> > - (Value *V, Reduction &R) -> bool {
> > + std::function<bool(Value*, BasicBlock*, Reduction&)> Search = [&]
> > + (Value *V, BasicBlock *BB, Reduction &R) -> bool {
> >
> > // If we find a non-instruction, try to use it as the initial accumulator
> > // value. This may have already been found during the search in which case
> > @@ -461,6 +426,9 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L)
> > if (!I)
> > return R.InsertAcc(V);
> >
> > + if (I->getParent() != BB)
> > + return false;
> > +
> > switch (I->getOpcode()) {
> > default:
> > break;
> > @@ -471,8 +439,8 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L)
> > // Adds should be adding together two muls, or another add and a mul to
> > // be within the mac chain. One of the operands may also be the
> > // accumulator value at which point we should stop searching.
> > - bool ValidLHS = Search(I->getOperand(0), R);
> > - bool ValidRHS = Search(I->getOperand(1), R);
> > + bool ValidLHS = Search(I->getOperand(0), BB, R);
> > + bool ValidRHS = Search(I->getOperand(1), BB, R);
> > if (!ValidLHS && !ValidLHS)
> > return false;
> > else if (ValidLHS && ValidRHS) {
> > @@ -498,36 +466,40 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L)
> > return false;
> > }
> > case Instruction::SExt:
> > - return Search(I->getOperand(0), R);
> > + return Search(I->getOperand(0), BB, R);
> > }
> > return false;
> > };
> >
> > bool Changed = false;
> > - SmallPtrSet<Instruction*, 4> AllAdds;
> > - BasicBlock *Latch = L->getLoopLatch();
> >
> > - for (Instruction &I : reverse(*Latch)) {
> > - if (I.getOpcode() != Instruction::Add)
> > + for (auto &BB : F) {
> > + SmallPtrSet<Instruction*, 4> AllAdds;
> > + if (!RecordMemoryOps(&BB))
> > continue;
> >
> > - if (AllAdds.count(&I))
> > - continue;
> > + for (Instruction &I : reverse(BB)) {
> > + if (I.getOpcode() != Instruction::Add)
> > + continue;
> >
> > - const auto *Ty = I.getType();
> > - if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
> > - continue;
> > + if (AllAdds.count(&I))
> > + continue;
> >
> > - Reduction R(&I);
> > - if (!Search(&I, R))
> > - continue;
> > + const auto *Ty = I.getType();
> > + if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
> > + continue;
> >
> > - if (!CreateParallelPairs(R))
> > - continue;
> > + Reduction R(&I);
> > + if (!Search(&I, &BB, R))
> > + continue;
> >
> > - InsertParallelMACs(R);
> > - Changed = true;
> > - AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
> > + if (!CreateParallelPairs(R))
> > + continue;
> > +
> > + InsertParallelMACs(R);
> > + Changed = true;
> > + AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
> > + }
> > }
> >
> > return Changed;
> > @@ -745,6 +717,6 @@ Pass *llvm::createARMParallelDSPPass() {
> > char ARMParallelDSP::ID = 0;
> >
> > INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
> > - "Transform loops to use DSP intrinsics", false, false)
> > + "Transform functions to use DSP intrinsics", false, false)
> > INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
> > - "Transform loops to use DSP intrinsics", false, false)
> > + "Transform functions to use DSP intrinsics", false, false)
> >
> > Modified: llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll
> > URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll?rev=367389&r1=367388&r2=367389&view=diff
> > ==============================================================================
> > --- llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll (original)
> > +++ llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll Wed Jul 31 00:32:03 2019
> > @@ -37,8 +37,7 @@
> > ; CHECK-NEXT: Scalar Evolution Analysis
> > ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
> > ; CHECK-NEXT: Function Alias Analysis Results
> > -; CHECK-NEXT: Loop Pass Manager
> > -; CHECK-NEXT: Transform loops to use DSP intrinsics
> > +; CHECK-NEXT: Transform functions to use DSP intrinsics
> > ; CHECK-NEXT: Interleaved Access Pass
> > ; CHECK-NEXT: ARM IR optimizations
> > ; CHECK-NEXT: Dominator Tree Construction
> >
> > Added: llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll
> > URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll?rev=367389&view=auto
> > ==============================================================================
> > --- llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll (added)
> > +++ llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll Wed Jul 31 00:32:03 2019
> > @@ -0,0 +1,79 @@
> > +; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
> > +
> > +; CHECK-LABEL: single_block
> > +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
> > +; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
> > +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
> > +; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
> > +; CHECK call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc)
> > +define i32 @single_block(i16* %a, i16* %b, i32 %acc) {
> > +entry:
> > + %ld.a.0 = load i16, i16* %a
> > + %sext.a.0 = sext i16 %ld.a.0 to i32
> > + %ld.b.0 = load i16, i16* %b
> > + %sext.b.0 = sext i16 %ld.b.0 to i32
> > + %mul.0 = mul i32 %sext.a.0, %sext.b.0
> > + %addr.a.1 = getelementptr i16, i16* %a, i32 1
> > + %addr.b.1 = getelementptr i16, i16* %b, i32 1
> > + %ld.a.1 = load i16, i16* %addr.a.1
> > + %sext.a.1 = sext i16 %ld.a.1 to i32
> > + %ld.b.1 = load i16, i16* %addr.b.1
> > + %sext.b.1 = sext i16 %ld.b.1 to i32
> > + %mul.1 = mul i32 %sext.a.1, %sext.b.1
> > + %add = add i32 %mul.0, %mul.1
> > + %res = add i32 %add, %acc
> > + ret i32 %res
> > +}
> > +
> > +; CHECK-LABEL: multi_block
> > +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
> > +; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
> > +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
> > +; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
> > +; CHECK call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0)
> > +define i32 @multi_block(i16* %a, i16* %b, i32 %acc) {
> > +entry:
> > + %ld.a.0 = load i16, i16* %a
> > + %sext.a.0 = sext i16 %ld.a.0 to i32
> > + %ld.b.0 = load i16, i16* %b
> > + %sext.b.0 = sext i16 %ld.b.0 to i32
> > + %mul.0 = mul i32 %sext.a.0, %sext.b.0
> > + %addr.a.1 = getelementptr i16, i16* %a, i32 1
> > + %addr.b.1 = getelementptr i16, i16* %b, i32 1
> > + %ld.a.1 = load i16, i16* %addr.a.1
> > + %sext.a.1 = sext i16 %ld.a.1 to i32
> > + %ld.b.1 = load i16, i16* %addr.b.1
> > + %sext.b.1 = sext i16 %ld.b.1 to i32
> > + %mul.1 = mul i32 %sext.a.1, %sext.b.1
> > + %add = add i32 %mul.0, %mul.1
> > + br label %bb.1
> > +
> > +bb.1:
> > + %res = add i32 %add, %acc
> > + ret i32 %res
> > +}
> > +
> > +; CHECK-LABEL: multi_block_1
> > +; CHECK-NOT: call i32 @llvm.arm.smlad
> > +define i32 @multi_block_1(i16* %a, i16* %b, i32 %acc) {
> > +entry:
> > + %ld.a.0 = load i16, i16* %a
> > + %sext.a.0 = sext i16 %ld.a.0 to i32
> > + %ld.b.0 = load i16, i16* %b
> > + %sext.b.0 = sext i16 %ld.b.0 to i32
> > + %mul.0 = mul i32 %sext.a.0, %sext.b.0
> > + br label %bb.1
> > +
> > +bb.1:
> > + %addr.a.1 = getelementptr i16, i16* %a, i32 1
> > + %addr.b.1 = getelementptr i16, i16* %b, i32 1
> > + %ld.a.1 = load i16, i16* %addr.a.1
> > + %sext.a.1 = sext i16 %ld.a.1 to i32
> > + %ld.b.1 = load i16, i16* %addr.b.1
> > + %sext.b.1 = sext i16 %ld.b.1 to i32
> > + %mul.1 = mul i32 %sext.a.1, %sext.b.1
> > + %add = add i32 %mul.0, %mul.1
> > + %res = add i32 %add, %acc
> > + ret i32 %res
> > +}
> > +
> >
> > Modified: llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll
> > URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll?rev=367389&r1=367388&r2=367389&view=diff
> > ==============================================================================
> > --- llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll (original)
> > +++ llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll Wed Jul 31 00:32:03 2019
> > @@ -2,7 +2,7 @@
> > ;
> > ; The loop header is not the loop latch.
> > ;
> > -; CHECK-NOT: call i32 @llvm.arm.smlad
> > +; CHECK: call i32 @llvm.arm.smlad
> > ;
> > define dso_local i32 @test(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
> > entry:
> >
> >
> > _______________________________________________
> > llvm-commits mailing list
> > llvm-commits at lists.llvm.org
> > https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20190822/f5ffea25/attachment.html>
More information about the llvm-commits
mailing list