[llvm] 8ceccbd - [SLP]Outline and fix code for finding common insertelement vectors.
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 2 09:19:18 PST 2021
Author: Alexey Bataev
Date: 2021-12-02T09:18:25-08:00
New Revision: 8ceccbd321319434450717ac1917cdca174ff6eb
URL: https://github.com/llvm/llvm-project/commit/8ceccbd321319434450717ac1917cdca174ff6eb
DIFF: https://github.com/llvm/llvm-project/commit/8ceccbd321319434450717ac1917cdca174ff6eb.diff
LOG: [SLP]Outline and fix code for finding common insertelement vectors.
Outline the code for finding common vectors in insertelement
instructions into a separate function so that future patches can reuse
it. This also improves the process by adding some extra checks for early
exit, and fixes a bug where a match was always found because each value
was erroneously compared with itself.
Differential Revision: https://reviews.llvm.org/D114909
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a6231f8b6e7f..95061e9053fa 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5375,6 +5375,42 @@ InstructionCost BoUpSLP::getSpillCost() const {
return Cost;
}
+/// Check if two insertelement instructions are from the same buildvector.
+static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU,
+ InsertElementInst *V) {
+ // Instructions must be from the same basic blocks.
+ if (VU->getParent() != V->getParent())
+ return false;
+ // Checks if 2 insertelements are from the same buildvector.
+ if (VU->getType() != V->getType())
+ return false;
+ // Multiple used inserts are separate nodes.
+ if (!VU->hasOneUse() && !V->hasOneUse())
+ return false;
+ auto *IE1 = VU;
+ auto *IE2 = V;
+ // Go through the vector operand of insertelement instructions trying to find
+ // either VU as the original vector for IE2 or V as the original vector for
+ // IE1.
+ do {
+ if (IE2 == VU || IE1 == V)
+ return true;
+ if (IE1) {
+ if (IE1 != VU && !IE1->hasOneUse())
+ IE1 = nullptr;
+ else
+ IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
+ }
+ if (IE2) {
+ if (IE2 != V && !IE2->hasOneUse())
+ IE2 = nullptr;
+ else
+ IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
+ }
+ } while (IE1 || IE2);
+ return false;
+}
+
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
@@ -5422,29 +5458,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// If found user is an insertelement, do not calculate extract cost but try
// to detect it as a final shuffled/identity match.
- if (isa_and_nonnull<InsertElementInst>(EU.User)) {
- if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) {
- Optional<int> InsertIdx = getInsertIndex(EU.User, 0);
+ if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
+ if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
+ Optional<int> InsertIdx = getInsertIndex(VU, 0);
if (!InsertIdx || *InsertIdx == UndefMaskElem)
continue;
- Value *VU = EU.User;
auto *It = find_if(FirstUsers, [VU](Value *V) {
- // Checks if 2 insertelements are from the same buildvector.
- if (VU->getType() != V->getType())
- return false;
- auto *IE1 = cast<InsertElementInst>(VU);
- auto *IE2 = cast<InsertElementInst>(V);
- // Go through of insertelement instructions trying to find either VU
- // as the original vector for IE2 or V as the original vector for IE1.
- do {
- if (IE1 == VU || IE2 == V)
- return true;
- if (IE1)
- IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
- if (IE2)
- IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
- } while (IE1 || IE2);
- return false;
+ return areTwoInsertFromSameBuildVector(VU,
+ cast<InsertElementInst>(V));
});
int VecId = -1;
if (It == FirstUsers.end()) {
@@ -5455,7 +5476,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
while (isa<InsertElementInst>(Base)) {
// Build the mask for the vectorized insertelement instructions.
if (const TreeEntry *E = getTreeEntry(Base)) {
- VU = Base;
+ VU = cast<InsertElementInst>(Base);
do {
int Idx = E->findLaneForValue(Base);
ShuffleMask.back()[Idx] = Idx;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
index 85a79014506a..b1fbebbee12e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX
;
; Check that we can commute operands based on the predicate.
@@ -235,26 +235,46 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) {
}
define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
-; CHECK-LABEL: @fcmp_ord_uno_v4i32(
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
-; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
-; CHECK-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; CHECK-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 0>
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
-; CHECK-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
-; CHECK-NEXT: ret <4 x i32> [[R]]
+; SSE-LABEL: @fcmp_ord_uno_v4i32(
+; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; SSE-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
+; SSE-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; SSE-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
+; SSE-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 0>
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i32 0
+; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i32 1
+; SSE-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
+; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
+; SSE-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
+; SSE-NEXT: ret <4 x i32> [[R]]
+;
+; AVX-LABEL: @fcmp_ord_uno_v4i32(
+; AVX-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; AVX-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; AVX-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
+; AVX-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
+; AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; AVX-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
+; AVX-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
+; AVX-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
+; AVX-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
+; AVX-NEXT: [[D0:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
+; AVX-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i32 3
+; AVX-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
+; AVX-NEXT: ret <4 x i32> [[R]]
;
%a0 = extractelement <4 x float> %a, i32 0
%a1 = extractelement <4 x float> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
index 5e99d77106ec..87df8546d79d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX
;
; Check that we can commute operands based on the predicate.
@@ -235,26 +235,46 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) {
}
define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
-; CHECK-LABEL: @fcmp_ord_uno_v4i32(
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
-; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
-; CHECK-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; CHECK-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 0>
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
-; CHECK-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
-; CHECK-NEXT: ret <4 x i32> [[R]]
+; SSE-LABEL: @fcmp_ord_uno_v4i32(
+; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; SSE-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
+; SSE-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; SSE-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
+; SSE-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 0>
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i32 0
+; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i32 1
+; SSE-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
+; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
+; SSE-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
+; SSE-NEXT: ret <4 x i32> [[R]]
+;
+; AVX-LABEL: @fcmp_ord_uno_v4i32(
+; AVX-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; AVX-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; AVX-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
+; AVX-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
+; AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; AVX-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
+; AVX-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
+; AVX-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
+; AVX-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
+; AVX-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
+; AVX-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i32 3
+; AVX-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
+; AVX-NEXT: ret <4 x i32> [[R]]
;
%a0 = extractelement <4 x float> %a, i32 0
%a1 = extractelement <4 x float> %a, i32 1
More information about the llvm-commits mailing list