[llvm] 8b8281f - [SLP]Do not vectorize non-profitable alternate nodes.
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Fri May 13 14:48:40 PDT 2022
Author: Alexey Bataev
Date: 2022-05-13T14:28:54-07:00
New Revision: 8b8281f354758be2d154625155b392bee2daad58
URL: https://github.com/llvm/llvm-project/commit/8b8281f354758be2d154625155b392bee2daad58
DIFF: https://github.com/llvm/llvm-project/commit/8b8281f354758be2d154625155b392bee2daad58.diff
LOG: [SLP]Do not vectorize non-profitable alternate nodes.
If alternate node has only 2 instructions and the tree is already big
enough, better to skip the vectorization of such nodes, they are not
very profitable (the resulting code contains 3 instructions instead of
original 2 scalars). SLP can try to vectorize the buildvector sequence
in the next attempt, if it is profitable.
Metric: SLP.NumVectorInstructions
Program SLP.NumVectorInstructions
results results0 diff
test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/miniAMR/miniAMR.test 72.00 73.00 1.4%
test-suite :: MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc.test 1186.00 1198.00 1.0%
test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test 241.00 242.00 0.4%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test 2131.00 2139.00 0.4%
test-suite :: External/SPEC/CINT2017rate/523.xalancbmk_r/523.xalancbmk_r.test 6377.00 6384.00 0.1%
test-suite :: External/SPEC/CINT2017speed/623.xalancbmk_s/623.xalancbmk_s.test 6377.00 6384.00 0.1%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 12650.00 12658.00 0.1%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 26169.00 26147.00 -0.1%
test-suite :: MultiSource/Benchmarks/Trimaran/enc-3des/enc-3des.test 99.00 86.00 -13.1%
Gains:
526.blender_r - more vectorized trees.
enc-3des - same.
Others:
510.parest_r - no changes.
miniFE - same
623.xalancbmk_s - some (non-profitable) parts of the trees are not
vectorized.
523.xalancbmk_r - same
lencod - same
timberwolfmc - same
miniAMR - same
Differential Revision: https://reviews.llvm.org/D125571
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b9f86cd10498..830cc8d0217a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2016,11 +2016,13 @@ class BoUpSLP {
/// for a pair which have highest score deemed to have best chance to form
/// root of profitable tree to vectorize. Return None if no candidate scored
/// above the LookAheadHeuristics::ScoreFail.
+ /// \param Limit Lower limit of the cost, considered to be good enough score.
Optional<int>
- findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates) {
+ findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
+ int Limit = LookAheadHeuristics::ScoreFail) {
LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2,
RootLookAheadMaxDepth);
- int BestScore = LookAheadHeuristics::ScoreFail;
+ int BestScore = Limit;
Optional<int> Index = None;
for (int I : seq<int>(0, Candidates.size())) {
int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
@@ -4524,10 +4526,64 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// If all of the operands are identical or constant we have a simple solution.
// If we deal with insert/extract instructions, they all must have constant
// indices, otherwise we should gather them, not try to vectorize.
+ // If alternate op node with 2 elements with gathered operands - do not
+ // vectorize.
+ auto &&NotProfitableForVectorization = [&S, this,
+ Depth](ArrayRef<Value *> VL) {
+ if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
+ return false;
+ if (VectorizableTree.size() < MinTreeSize)
+ return false;
+ if (Depth >= RecursionMaxDepth - 1)
+ return true;
+ // Check if all operands are extracts, part of vector node or can build a
+ // regular vectorize node.
+ SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
+ for (Value *V : VL) {
+ auto *I = cast<Instruction>(V);
+ InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
+ return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
+ }));
+ }
+ bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
+ if ((IsCommutative &&
+ std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
+ (!IsCommutative &&
+ all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
+ return true;
+ assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
+ SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
+ auto *I1 = cast<Instruction>(VL.front());
+ auto *I2 = cast<Instruction>(VL.back());
+ for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
+ Candidates.emplace_back().emplace_back(I1->getOperand(Op),
+ I2->getOperand(Op));
+ if (count_if(
+ Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
+ return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
+ }) >= S.MainOp->getNumOperands() / 2)
+ return false;
+ if (S.MainOp->getNumOperands() > 2)
+ return true;
+ if (IsCommutative) {
+ // Check permuted operands.
+ Candidates.clear();
+ for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
+ Candidates.emplace_back().emplace_back(I1->getOperand(Op),
+ I2->getOperand((Op + 1) % E));
+ if (any_of(
+ Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
+ return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
+ }))
+ return false;
+ }
+ return true;
+ };
if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() ||
(isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) &&
- !all_of(VL, isVectorLikeInstWithConstOps))) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
+ !all_of(VL, isVectorLikeInstWithConstOps)) ||
+ NotProfitableForVectorization(VL)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
if (TryToFindDuplicates(S))
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
index 318d45203efb..616058d6f67f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-4 | FileCheck %s --check-prefix=CHECK
-; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-7 -slp-min-tree-size=6 | FileCheck %s --check-prefix=FORCE_REDUCTION
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-6 -slp-min-tree-size=5 | FileCheck %s --check-prefix=FORCE_REDUCTION
define void @Test(i32) {
; CHECK-LABEL: @Test(
@@ -79,24 +79,22 @@ define void @Test(i32) {
; FORCE_REDUCTION-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP0]], i32 15
; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]]
; FORCE_REDUCTION: loop:
-; FORCE_REDUCTION-NEXT: [[TMP25:%.*]] = phi <2 x i32> [ [[TMP36:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; FORCE_REDUCTION-NEXT: [[TMP25:%.*]] = phi <2 x i32> [ [[TMP32:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; FORCE_REDUCTION-NEXT: [[TMP26:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
-; FORCE_REDUCTION-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP24]])
-; FORCE_REDUCTION-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]])
-; FORCE_REDUCTION-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP26]])
-; FORCE_REDUCTION-NEXT: [[OP_RDX13:%.*]] = and i32 [[TMP29]], [[TMP0]]
-; FORCE_REDUCTION-NEXT: [[OP_RDX14:%.*]] = and i32 [[OP_RDX13]], [[TMP0]]
-; FORCE_REDUCTION-NEXT: [[OP_RDX15:%.*]] = and i32 [[OP_RDX14]], [[TMP0]]
-; FORCE_REDUCTION-NEXT: [[OP_RDX16:%.*]] = and i32 [[OP_RDX15]], [[TMP27]]
-; FORCE_REDUCTION-NEXT: [[OP_RDX17:%.*]] = and i32 [[OP_RDX16]], [[TMP28]]
-; FORCE_REDUCTION-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_RDX17]], i32 0
-; FORCE_REDUCTION-NEXT: [[TMP31:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
-; FORCE_REDUCTION-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> poison, i32 [[TMP31]], i32 0
-; FORCE_REDUCTION-NEXT: [[TMP33:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP31]], i32 1
-; FORCE_REDUCTION-NEXT: [[TMP34:%.*]] = and <2 x i32> [[TMP30]], [[TMP33]]
-; FORCE_REDUCTION-NEXT: [[TMP35:%.*]] = add <2 x i32> [[TMP30]], [[TMP33]]
-; FORCE_REDUCTION-NEXT: [[TMP36]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> [[TMP35]], <2 x i32> <i32 0, i32 3>
+; FORCE_REDUCTION-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
+; FORCE_REDUCTION-NEXT: [[TMP27:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
+; FORCE_REDUCTION-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP24]])
+; FORCE_REDUCTION-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]])
+; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP28]], [[TMP29]]
+; FORCE_REDUCTION-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP27]])
+; FORCE_REDUCTION-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP30]]
+; FORCE_REDUCTION-NEXT: [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_RDX3:%.*]] = and i32 [[OP_RDX2]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_RDX4:%.*]] = and i32 [[OP_RDX3]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_RDX5:%.*]] = and i32 [[OP_RDX4]], [[TMP26]]
+; FORCE_REDUCTION-NEXT: [[VAL_43:%.*]] = add i32 [[TMP26]], 14910
+; FORCE_REDUCTION-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX5]], i32 0
+; FORCE_REDUCTION-NEXT: [[TMP32]] = insertelement <2 x i32> [[TMP31]], i32 [[VAL_43]], i32 1
; FORCE_REDUCTION-NEXT: br label [[LOOP]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
index 6752641753ff..d2a56c9415b1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
@@ -5,18 +5,18 @@ define dso_local void @rftbsub(double* %a) local_unnamed_addr #0 {
; CHECK-LABEL: @rftbsub(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2
-; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8
+; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef
; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]]
; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]]
-; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef
-; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]]
-; CHECK-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]]
-; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[ADD19]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[SUB22]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
; CHECK-NEXT: unreachable
;
entry:
More information about the llvm-commits
mailing list