[llvm] r367389 - [ARM][ParallelDSP] Convert to function pass
Hans Wennborg via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 19 09:06:13 PDT 2019
Just a heads up that we bisected a regression in Chromium (really
WebRTC) to this revision:
https://bugs.chromium.org/p/webrtc/issues/detail?id=10887
We still don't know what's wrong, and the problem could lie in the
WebRTC code, but I wanted to give a heads up that this affected our
code.
On Wed, Jul 31, 2019 at 9:31 AM Sam Parker via llvm-commits
<llvm-commits at lists.llvm.org> wrote:
>
> Author: sam_parker
> Date: Wed Jul 31 00:32:03 2019
> New Revision: 367389
>
> URL: http://llvm.org/viewvc/llvm-project?rev=367389&view=rev
> Log:
> [ARM][ParallelDSP] Convert to function pass
>
> Run across a whole function, visiting each basic block one at a time.
>
> Differential Revision: https://reviews.llvm.org/D65324
>
> Added:
> llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll
> Modified:
> llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
> llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll
> llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll
>
> Modified: llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp?rev=367389&r1=367388&r2=367389&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp (original)
> +++ llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp Wed Jul 31 00:32:03 2019
> @@ -1,4 +1,4 @@
> -//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
> +//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
> //
> // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
> // See https://llvm.org/LICENSE.txt for license information.
> @@ -18,13 +18,10 @@
> #include "llvm/ADT/SmallPtrSet.h"
> #include "llvm/Analysis/AliasAnalysis.h"
> #include "llvm/Analysis/LoopAccessAnalysis.h"
> -#include "llvm/Analysis/LoopPass.h"
> -#include "llvm/Analysis/LoopInfo.h"
> #include "llvm/IR/Instructions.h"
> #include "llvm/IR/NoFolder.h"
> #include "llvm/Transforms/Scalar.h"
> #include "llvm/Transforms/Utils/BasicBlockUtils.h"
> -#include "llvm/Transforms/Utils/LoopUtils.h"
> #include "llvm/Pass.h"
> #include "llvm/PassRegistry.h"
> #include "llvm/PassSupport.h"
> @@ -156,13 +153,11 @@ namespace {
> }
> };
>
> - class ARMParallelDSP : public LoopPass {
> + class ARMParallelDSP : public FunctionPass {
> ScalarEvolution *SE;
> AliasAnalysis *AA;
> TargetLibraryInfo *TLI;
> DominatorTree *DT;
> - LoopInfo *LI;
> - Loop *L;
> const DataLayout *DL;
> Module *M;
> std::map<LoadInst*, LoadInst*> LoadPairs;
> @@ -184,63 +179,38 @@ namespace {
> /// products to a 32-bit accumulate operand. Optionally, the instruction can
> /// exchange the halfwords of the second operand before performing the
> /// arithmetic.
> - bool MatchSMLAD(Loop *L);
> + bool MatchSMLAD(Function &F);
>
> public:
> static char ID;
>
> - ARMParallelDSP() : LoopPass(ID) { }
> -
> - bool doInitialization(Loop *L, LPPassManager &LPM) override {
> - LoadPairs.clear();
> - WideLoads.clear();
> - return true;
> - }
> + ARMParallelDSP() : FunctionPass(ID) { }
>
> void getAnalysisUsage(AnalysisUsage &AU) const override {
> - LoopPass::getAnalysisUsage(AU);
> + FunctionPass::getAnalysisUsage(AU);
> AU.addRequired<AssumptionCacheTracker>();
> AU.addRequired<ScalarEvolutionWrapperPass>();
> AU.addRequired<AAResultsWrapperPass>();
> AU.addRequired<TargetLibraryInfoWrapperPass>();
> - AU.addRequired<LoopInfoWrapperPass>();
> AU.addRequired<DominatorTreeWrapperPass>();
> AU.addRequired<TargetPassConfig>();
> - AU.addPreserved<LoopInfoWrapperPass>();
> + AU.addPreserved<ScalarEvolutionWrapperPass>();
> + AU.addPreserved<GlobalsAAWrapperPass>();
> AU.setPreservesCFG();
> }
>
> - bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
> + bool runOnFunction(Function &F) override {
> if (DisableParallelDSP)
> return false;
> - if (skipLoop(TheLoop))
> + if (skipFunction(F))
> return false;
>
> - L = TheLoop;
> SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
> AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
> TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
> DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
> - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
> auto &TPC = getAnalysis<TargetPassConfig>();
>
> - BasicBlock *Header = TheLoop->getHeader();
> - if (!Header)
> - return false;
> -
> - // TODO: We assume the loop header and latch to be the same block.
> - // This is not a fundamental restriction, but lifting this would just
> - // require more work to do the transformation and then patch up the CFG.
> - if (Header != TheLoop->getLoopLatch()) {
> - LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
> - "running pass ARMParallelDSP\n");
> - return false;
> - }
> -
> - if (!TheLoop->getLoopPreheader())
> - InsertPreheaderForLoop(L, DT, LI, nullptr, true);
> -
> - Function &F = *Header->getParent();
> M = F.getParent();
> DL = &M->getDataLayout();
>
> @@ -265,17 +235,10 @@ namespace {
> return false;
> }
>
> - LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
> -
> LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
> LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
>
> - if (!RecordMemoryOps(Header)) {
> - LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
> - return false;
> - }
> -
> - bool Changes = MatchSMLAD(L);
> + bool Changes = MatchSMLAD(F);
> return Changes;
> }
> };
> @@ -337,6 +300,8 @@ bool ARMParallelDSP::IsNarrowSequence(Va
> bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
> SmallVector<LoadInst*, 8> Loads;
> SmallVector<Instruction*, 8> Writes;
> + LoadPairs.clear();
> + WideLoads.clear();
>
> // Collect loads and instruction that may write to memory. For now we only
> // record loads which are simple, sign-extended and have a single user.
> @@ -415,7 +380,7 @@ bool ARMParallelDSP::RecordMemoryOps(Bas
> return LoadPairs.size() > 1;
> }
>
> -// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
> +// The pass needs to identify integer add/sub reductions of 16-bit vector
> // multiplications.
> // To use SMLAD:
> // 1) we first need to find integer add then look for this pattern:
> @@ -446,13 +411,13 @@ bool ARMParallelDSP::RecordMemoryOps(Bas
> // If loop invariants are used instead of loads, these need to be packed
> // before the loop begins.
> //
> -bool ARMParallelDSP::MatchSMLAD(Loop *L) {
> +bool ARMParallelDSP::MatchSMLAD(Function &F) {
> // Search recursively back through the operands to find a tree of values that
> // form a multiply-accumulate chain. The search records the Add and Mul
> // instructions that form the reduction and allows us to find a single value
> // to be used as the initial input to the accumlator.
> - std::function<bool(Value*, Reduction&)> Search = [&]
> - (Value *V, Reduction &R) -> bool {
> + std::function<bool(Value*, BasicBlock*, Reduction&)> Search = [&]
> + (Value *V, BasicBlock *BB, Reduction &R) -> bool {
>
> // If we find a non-instruction, try to use it as the initial accumulator
> // value. This may have already been found during the search in which case
> @@ -461,6 +426,9 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L)
> if (!I)
> return R.InsertAcc(V);
>
> + if (I->getParent() != BB)
> + return false;
> +
> switch (I->getOpcode()) {
> default:
> break;
> @@ -471,8 +439,8 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L)
> // Adds should be adding together two muls, or another add and a mul to
> // be within the mac chain. One of the operands may also be the
> // accumulator value at which point we should stop searching.
> - bool ValidLHS = Search(I->getOperand(0), R);
> - bool ValidRHS = Search(I->getOperand(1), R);
> + bool ValidLHS = Search(I->getOperand(0), BB, R);
> + bool ValidRHS = Search(I->getOperand(1), BB, R);
> if (!ValidLHS && !ValidLHS)
> return false;
> else if (ValidLHS && ValidRHS) {
> @@ -498,36 +466,40 @@ bool ARMParallelDSP::MatchSMLAD(Loop *L)
> return false;
> }
> case Instruction::SExt:
> - return Search(I->getOperand(0), R);
> + return Search(I->getOperand(0), BB, R);
> }
> return false;
> };
>
> bool Changed = false;
> - SmallPtrSet<Instruction*, 4> AllAdds;
> - BasicBlock *Latch = L->getLoopLatch();
>
> - for (Instruction &I : reverse(*Latch)) {
> - if (I.getOpcode() != Instruction::Add)
> + for (auto &BB : F) {
> + SmallPtrSet<Instruction*, 4> AllAdds;
> + if (!RecordMemoryOps(&BB))
> continue;
>
> - if (AllAdds.count(&I))
> - continue;
> + for (Instruction &I : reverse(BB)) {
> + if (I.getOpcode() != Instruction::Add)
> + continue;
>
> - const auto *Ty = I.getType();
> - if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
> - continue;
> + if (AllAdds.count(&I))
> + continue;
>
> - Reduction R(&I);
> - if (!Search(&I, R))
> - continue;
> + const auto *Ty = I.getType();
> + if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
> + continue;
>
> - if (!CreateParallelPairs(R))
> - continue;
> + Reduction R(&I);
> + if (!Search(&I, &BB, R))
> + continue;
>
> - InsertParallelMACs(R);
> - Changed = true;
> - AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
> + if (!CreateParallelPairs(R))
> + continue;
> +
> + InsertParallelMACs(R);
> + Changed = true;
> + AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
> + }
> }
>
> return Changed;
> @@ -745,6 +717,6 @@ Pass *llvm::createARMParallelDSPPass() {
> char ARMParallelDSP::ID = 0;
>
> INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
> - "Transform loops to use DSP intrinsics", false, false)
> + "Transform functions to use DSP intrinsics", false, false)
> INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
> - "Transform loops to use DSP intrinsics", false, false)
> + "Transform functions to use DSP intrinsics", false, false)
>
> Modified: llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll?rev=367389&r1=367388&r2=367389&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll (original)
> +++ llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll Wed Jul 31 00:32:03 2019
> @@ -37,8 +37,7 @@
> ; CHECK-NEXT: Scalar Evolution Analysis
> ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
> ; CHECK-NEXT: Function Alias Analysis Results
> -; CHECK-NEXT: Loop Pass Manager
> -; CHECK-NEXT: Transform loops to use DSP intrinsics
> +; CHECK-NEXT: Transform functions to use DSP intrinsics
> ; CHECK-NEXT: Interleaved Access Pass
> ; CHECK-NEXT: ARM IR optimizations
> ; CHECK-NEXT: Dominator Tree Construction
>
> Added: llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll?rev=367389&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll (added)
> +++ llvm/trunk/test/CodeGen/ARM/ParallelDSP/blocks.ll Wed Jul 31 00:32:03 2019
> @@ -0,0 +1,79 @@
> +; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
> +
> +; CHECK-LABEL: single_block
> +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
> +; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
> +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
> +; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
> +; CHECK call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc)
> +define i32 @single_block(i16* %a, i16* %b, i32 %acc) {
> +entry:
> + %ld.a.0 = load i16, i16* %a
> + %sext.a.0 = sext i16 %ld.a.0 to i32
> + %ld.b.0 = load i16, i16* %b
> + %sext.b.0 = sext i16 %ld.b.0 to i32
> + %mul.0 = mul i32 %sext.a.0, %sext.b.0
> + %addr.a.1 = getelementptr i16, i16* %a, i32 1
> + %addr.b.1 = getelementptr i16, i16* %b, i32 1
> + %ld.a.1 = load i16, i16* %addr.a.1
> + %sext.a.1 = sext i16 %ld.a.1 to i32
> + %ld.b.1 = load i16, i16* %addr.b.1
> + %sext.b.1 = sext i16 %ld.b.1 to i32
> + %mul.1 = mul i32 %sext.a.1, %sext.b.1
> + %add = add i32 %mul.0, %mul.1
> + %res = add i32 %add, %acc
> + ret i32 %res
> +}
> +
> +; CHECK-LABEL: multi_block
> +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
> +; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
> +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
> +; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
> +; CHECK call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0)
> +define i32 @multi_block(i16* %a, i16* %b, i32 %acc) {
> +entry:
> + %ld.a.0 = load i16, i16* %a
> + %sext.a.0 = sext i16 %ld.a.0 to i32
> + %ld.b.0 = load i16, i16* %b
> + %sext.b.0 = sext i16 %ld.b.0 to i32
> + %mul.0 = mul i32 %sext.a.0, %sext.b.0
> + %addr.a.1 = getelementptr i16, i16* %a, i32 1
> + %addr.b.1 = getelementptr i16, i16* %b, i32 1
> + %ld.a.1 = load i16, i16* %addr.a.1
> + %sext.a.1 = sext i16 %ld.a.1 to i32
> + %ld.b.1 = load i16, i16* %addr.b.1
> + %sext.b.1 = sext i16 %ld.b.1 to i32
> + %mul.1 = mul i32 %sext.a.1, %sext.b.1
> + %add = add i32 %mul.0, %mul.1
> + br label %bb.1
> +
> +bb.1:
> + %res = add i32 %add, %acc
> + ret i32 %res
> +}
> +
> +; CHECK-LABEL: multi_block_1
> +; CHECK-NOT: call i32 @llvm.arm.smlad
> +define i32 @multi_block_1(i16* %a, i16* %b, i32 %acc) {
> +entry:
> + %ld.a.0 = load i16, i16* %a
> + %sext.a.0 = sext i16 %ld.a.0 to i32
> + %ld.b.0 = load i16, i16* %b
> + %sext.b.0 = sext i16 %ld.b.0 to i32
> + %mul.0 = mul i32 %sext.a.0, %sext.b.0
> + br label %bb.1
> +
> +bb.1:
> + %addr.a.1 = getelementptr i16, i16* %a, i32 1
> + %addr.b.1 = getelementptr i16, i16* %b, i32 1
> + %ld.a.1 = load i16, i16* %addr.a.1
> + %sext.a.1 = sext i16 %ld.a.1 to i32
> + %ld.b.1 = load i16, i16* %addr.b.1
> + %sext.b.1 = sext i16 %ld.b.1 to i32
> + %mul.1 = mul i32 %sext.a.1, %sext.b.1
> + %add = add i32 %mul.0, %mul.1
> + %res = add i32 %add, %acc
> + ret i32 %res
> +}
> +
>
> Modified: llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll?rev=367389&r1=367388&r2=367389&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll (original)
> +++ llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad12.ll Wed Jul 31 00:32:03 2019
> @@ -2,7 +2,7 @@
> ;
> ; The loop header is not the loop latch.
> ;
> -; CHECK-NOT: call i32 @llvm.arm.smlad
> +; CHECK: call i32 @llvm.arm.smlad
> ;
> define dso_local i32 @test(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
> entry:
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
More information about the llvm-commits
mailing list