[llvm] 5dccea5 - [SLP]Do not emit many extractelements, reuse the single one emitted.
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 30 06:45:13 PST 2022
Author: Alexey Bataev
Date: 2022-12-30T06:38:06-08:00
New Revision: 5dccea5a68fb5181ab88ce6faac4668934f35cd7
URL: https://github.com/llvm/llvm-project/commit/5dccea5a68fb5181ab88ce6faac4668934f35cd7
DIFF: https://github.com/llvm/llvm-project/commit/5dccea5a68fb5181ab88ce6faac4668934f35cd7.diff
LOG: [SLP]Do not emit many extractelements, reuse the single one emitted.
We do not need to emit a separate extractelement for each particular use; we
can reuse a single one and only need to move it so that it dominates
all of its uses.
Differential Revision: https://reviews.llvm.org/D140580
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 744cefb206cb2..f852a21692a5c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9554,6 +9554,9 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
SmallVector<ShuffledInsertData> ShuffledInserts;
// Maps vector instruction to original insertelement instruction
DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
+ // Maps extract Scalar to the corresponding extractelement instruction in the
+ // basic block. Only one extractelement per block should be emitted.
+ DenseMap<Value *, DenseMap<BasicBlock *, Value *>> ScalarToEEs;
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
Value *Scalar = ExternalUse.Scalar;
@@ -9578,13 +9581,29 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
Value *Lane = Builder.getInt32(ExternalUse.Lane);
auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
if (Scalar->getType() != Vec->getType()) {
- Value *Ex;
- // "Reuse" the existing extract to improve final codegen.
- if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
- Ex = Builder.CreateExtractElement(ES->getOperand(0),
- ES->getOperand(1));
- } else {
- Ex = Builder.CreateExtractElement(Vec, Lane);
+ Value *Ex = nullptr;
+ auto It = ScalarToEEs.find(Scalar);
+ if (It != ScalarToEEs.end()) {
+ // No need to emit many extracts, just move the only one in the
+ // current block.
+ auto EEIt = It->second.find(Builder.GetInsertBlock());
+ if (EEIt != It->second.end()) {
+ auto *I = cast<Instruction>(EEIt->second);
+ if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
+ Builder.GetInsertPoint()->comesBefore(I))
+ I->moveBefore(&*Builder.GetInsertPoint());
+ Ex = I;
+ }
+ }
+ if (!Ex) {
+ // "Reuse" the existing extract to improve final codegen.
+ if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
+ Ex = Builder.CreateExtractElement(ES->getOperand(0),
+ ES->getOperand(1));
+ } else {
+ Ex = Builder.CreateExtractElement(Vec, Lane);
+ }
+ ScalarToEEs[Scalar].try_emplace(Builder.GetInsertBlock(), Ex);
}
// The then branch of the previous if may produce constants, since 0
// operand might be a constant.
@@ -9615,8 +9634,11 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
"Scalar with nullptr as an external user must be registered in "
"ExternallyUsedValues map");
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
- Builder.SetInsertPoint(VecI->getParent(),
- std::next(VecI->getIterator()));
+ if (auto *PHI = dyn_cast<PHINode>(VecI))
+ Builder.SetInsertPoint(PHI->getParent()->getFirstNonPHI());
+ else
+ Builder.SetInsertPoint(VecI->getParent(),
+ std::next(VecI->getIterator()));
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
index 6c8518060b17d..9459b9376468e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -75,13 +75,13 @@ define void @pr35497() local_unnamed_addr #0 {
; SSE-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
; SSE-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
; SSE-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
-; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; SSE-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
-; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1
-; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
-; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1
-; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], <i64 2, i64 2>
-; SSE-NEXT: [[TMP10:%.*]] = and <2 x i64> [[TMP9]], <i64 20, i64 20>
+; SSE-NEXT: [[TMP5:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
+; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 1
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[ADD]], i32 0
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
+; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
+; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]]
; SSE-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
@@ -99,13 +99,13 @@ define void @pr35497() local_unnamed_addr #0 {
; AVX-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
-; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; AVX-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
-; AVX-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1
-; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
-; AVX-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1
-; AVX-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], <i64 2, i64 2>
-; AVX-NEXT: [[TMP10:%.*]] = and <2 x i64> [[TMP9]], <i64 20, i64 20>
+; AVX-NEXT: [[TMP5:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
+; AVX-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 1
+; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[ADD]], i32 0
+; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; AVX-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
+; AVX-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
+; AVX-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; AVX-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
; AVX-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]]
; AVX-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
More information about the llvm-commits
mailing list