[llvm] ddce6e0 - [SLP]Improve vectorization of cmp instructions sequences.
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 1 07:27:20 PST 2021
Author: Alexey Bataev
Date: 2021-12-01T07:26:29-08:00
New Revision: ddce6e05612dac4edec0ab636fb2cbdad1676879
URL: https://github.com/llvm/llvm-project/commit/ddce6e05612dac4edec0ab636fb2cbdad1676879
DIFF: https://github.com/llvm/llvm-project/commit/ddce6e05612dac4edec0ab636fb2cbdad1676879.diff
LOG: [SLP]Improve vectorization of cmp instructions sequences.
Final attempt to vectorize bundles of comptatible cmp instructions after
all other instructions processing.
Metric: SLP.NumVectorInstructions
Program results results0 diff
test-suite :: MultiSource/Benchmarks/mediabench/g721/g721encode/encode.test 1.00 5.00 400.0%
test-suite :: MultiSource/Benchmarks/PAQ8p/paq8p.test 8.00 11.00 37.5%
test-suite :: MultiSource/Benchmarks/Olden/voronoi/voronoi.test 20.00 26.00 30.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 1344.00 1648.00 22.6%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 1344.00 1648.00 22.6%
test-suite :: MultiSource/Benchmarks/Olden/bh/bh.test 102.00 124.00 21.6%
test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/CoMD/CoMD.test 118.00 133.00 12.7%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 3233.00 3554.00 9.9%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 3233.00 3554.00 9.9%
test-suite :: MultiSource/Benchmarks/Olden/power/power.test 64.00 70.00 9.4%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 7879.00 8604.00 9.2%
test-suite :: MultiSource/Benchmarks/Prolangs-C/simulator/simulator.test 50.00 54.00 8.0%
test-suite :: MultiSource/Applications/sqlite3/sqlite3.test 27.00 29.00 7.4%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 8345.00 8955.00 7.3%
test-suite :: MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc.test 694.00 738.00 6.3%
test-suite :: MultiSource/Benchmarks/MallocBench/gs/gs.test 361.00 382.00 5.8%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 409.00 430.00 5.1%
test-suite :: External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 140.00 147.00 5.0%
test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 140.00 147.00 5.0%
test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 4013.00 4206.00 4.8%
test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 966.00 1011.00 4.7%
test-suite :: SingleSource/Benchmarks/Misc/oourafft.test 65.00 68.00 4.6%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 4219.00 4381.00 3.8%
test-suite :: MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test 1911.00 1973.00 3.2%
test-suite :: External/SPEC/CINT2017rate/531.deepsjeng_r/531.deepsjeng_r.test 62.00 64.00 3.2%
test-suite :: External/SPEC/CINT2017speed/631.deepsjeng_s/631.deepsjeng_s.test 62.00 64.00 3.2%
test-suite :: External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s.test 852.00 877.00 2.9%
test-suite :: External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r.test 852.00 877.00 2.9%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test 1624.00 1668.00 2.7%
test-suite :: MultiSource/Benchmarks/McCat/18-imp/imp.test 39.00 40.00 2.6%
test-suite :: MultiSource/Benchmarks/MiBench/consumer-typeset/consumer-typeset.test 613.00 624.00 1.8%
test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test 378.00 383.00 1.3%
test-suite :: MultiSource/Benchmarks/MiBench/consumer-jpeg/consumer-jpeg.test 293.00 295.00 0.7%
test-suite :: MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/cjpeg.test 297.00 299.00 0.7%
test-suite :: External/SPEC/CINT2017rate/523.xalancbmk_r/523.xalancbmk_r.test 5522.00 5534.00 0.2%
test-suite :: External/SPEC/CINT2017speed/623.xalancbmk_s/623.xalancbmk_s.test 5522.00 5534.00 0.2%
Differential Revision: https://reviews.llvm.org/D114799
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4db630fbd063f..843150dcde0d8 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9318,6 +9318,63 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
continue;
OpsChanged |= tryToVectorize(I, R);
}
+ // Try to vectorize list of compares.
+ // Sort by type, compare predicate, etc.
+ // TODO: Add analysis on the operand opcodes (profitable to vectorize
+ // instructions with same/alternate opcodes/const values).
+ auto &&CompareSorter = [&R](Value *V, Value *V2) {
+ auto *CI1 = cast<CmpInst>(V);
+ auto *CI2 = cast<CmpInst>(V2);
+ if (R.isDeleted(CI2) || !isValidElementType(CI2->getType()))
+ return false;
+ if (CI1->getOperand(0)->getType()->getTypeID() <
+ CI2->getOperand(0)->getType()->getTypeID())
+ return true;
+ if (CI1->getOperand(0)->getType()->getTypeID() >
+ CI2->getOperand(0)->getType()->getTypeID())
+ return false;
+ return CI1->getPredicate() < CI2->getPredicate() ||
+ (CI1->getPredicate() > CI2->getPredicate() &&
+ CI1->getPredicate() <
+ CmpInst::getSwappedPredicate(CI2->getPredicate()));
+ };
+
+ auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) {
+ if (V1 == V2)
+ return true;
+ auto *CI1 = cast<CmpInst>(V1);
+ auto *CI2 = cast<CmpInst>(V2);
+ if (R.isDeleted(CI2) || !isValidElementType(CI2->getType()))
+ return false;
+ if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType())
+ return false;
+ return CI1->getPredicate() == CI2->getPredicate() ||
+ CI1->getPredicate() ==
+ CmpInst::getSwappedPredicate(CI2->getPredicate());
+ };
+ auto Limit = [&R](Value *V) {
+ unsigned EltSize = R.getVectorElementSize(V);
+ return std::max(2U, R.getMaxVecRegSize() / EltSize);
+ };
+
+ SmallVector<Value *> Vals(PostponedCmps.begin(), PostponedCmps.end());
+ OpsChanged |= tryToVectorizeSequence<Value>(
+ Vals, Limit, CompareSorter, AreCompatibleCompares,
+ [this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) {
+ // Exclude possible reductions from other blocks.
+ bool ArePossiblyReducedInOtherBlock =
+ any_of(Candidates, [](Value *V) {
+ return any_of(V->users(), [V](User *U) {
+ return isa<SelectInst>(U) &&
+ cast<SelectInst>(U)->getParent() !=
+ cast<Instruction>(V)->getParent();
+ });
+ });
+ if (ArePossiblyReducedInOtherBlock)
+ return false;
+ return tryToVectorizeList(Candidates, R, LimitForRegisterSize);
+ },
+ /*LimitForRegisterSize=*/true);
Instructions.clear();
} else {
// Insert in reverse order since the PostponedCmps vector was filled in
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
index dd2742c209e2d..9620f25b3aac5 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll
@@ -420,23 +420,22 @@ return:
define float @test_merge_anyof_v4si(<4 x i32> %t) {
; CHECK-LABEL: @test_merge_anyof_v4si(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[T:%.*]], i32 3
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[T]], i32 2
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[T]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[T]], i32 0
-; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x i32> [[T]]
-; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[T_FR]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i1> [[TMP4]] to i4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i4 [[TMP5]], 0
-; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP3]], 255
-; CHECK-NEXT: [[OR_COND3:%.*]] = select i1 [[TMP6]], i1 true, i1 [[CMP11]]
-; CHECK-NEXT: [[CMP14:%.*]] = icmp sgt i32 [[TMP2]], 255
-; CHECK-NEXT: [[OR_COND4:%.*]] = select i1 [[OR_COND3]], i1 true, i1 [[CMP14]]
-; CHECK-NEXT: [[CMP17:%.*]] = icmp sgt i32 [[TMP1]], 255
-; CHECK-NEXT: [[OR_COND5:%.*]] = select i1 [[OR_COND4]], i1 true, i1 [[CMP17]]
-; CHECK-NEXT: [[CMP20:%.*]] = icmp sgt i32 [[TMP0]], 255
-; CHECK-NEXT: [[OR_COND6:%.*]] = select i1 [[OR_COND5]], i1 true, i1 [[CMP20]]
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x i32> [[T:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i32> [[T_FR]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[TMP0]] to i4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i4 [[TMP1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[T_FR]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
+; CHECK-NEXT: [[OR_COND3:%.*]] = or i1 [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
+; CHECK-NEXT: [[OR_COND4:%.*]] = or i1 [[OR_COND3]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
+; CHECK-NEXT: [[OR_COND5:%.*]] = or i1 [[OR_COND4]], [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
+; CHECK-NEXT: [[OR_COND6:%.*]] = or i1 [[OR_COND5]], [[TMP7]]
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[T_FR]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[SHIFT]], [[T_FR]]
+; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[ADD]] to float
; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[CONV]]
; CHECK-NEXT: ret float [[RETVAL_0]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
index 6406e9ca572ce..e8408ad64146a 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -279,19 +279,26 @@ define i1 @cmp_lt_gt(double %a, double %b, double %c) {
; CHECK-LABEL: @cmp_lt_gt(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]]
-; CHECK-NEXT: [[ADD:%.*]] = fsub double [[C:%.*]], [[B]]
; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00
-; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[ADD]], [[MUL]]
-; CHECK-NEXT: [[SUB:%.*]] = fsub double [[FNEG]], [[C]]
-; CHECK-NEXT: [[DIV3:%.*]] = fdiv double [[SUB]], [[MUL]]
-; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[DIV]], 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT: [[CMP4:%.*]] = fcmp olt double [[DIV3]], 0x3EB0C6F7A0B5ED8D
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[FNEG]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
+; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
+; CHECK-NEXT: [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D
; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP]], i1 [[CMP4]], i1 false
; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
; CHECK: lor.lhs.false:
-; CHECK-NEXT: [[CMP5:%.*]] = fcmp ule double [[DIV]], 1.000000e+00
-; CHECK-NEXT: [[CMP7:%.*]] = fcmp ule double [[DIV3]], 1.000000e+00
-; CHECK-NEXT: [[OR_COND1:%.*]] = select i1 [[CMP5]], i1 true, i1 [[CMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], <double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
+; CHECK-NEXT: [[OR_COND1:%.*]] = select i1 [[TMP12]], i1 true, i1 [[TMP11]]
; CHECK-NEXT: br label [[CLEANUP]]
; CHECK: cleanup:
; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[OR_COND1]], [[LOR_LHS_FALSE]] ]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
index 02ae15a023483..d9b6207997236 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
@@ -132,6 +132,10 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
; MINTREESIZE-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
; MINTREESIZE-NEXT: [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
; MINTREESIZE-NEXT: [[CMP3:%.*]] = icmp ne i32 [[C3]], 0
+; MINTREESIZE-NEXT: [[TMP1:%.*]] = insertelement <4 x i1> poison, i1 [[CMP3]], i32 0
+; MINTREESIZE-NEXT: [[TMP2:%.*]] = insertelement <4 x i1> [[TMP1]], i1 [[CMP2]], i32 1
+; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> [[TMP2]], i1 [[CMP1]], i32 2
+; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[CMP0]], i32 3
; MINTREESIZE-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
; MINTREESIZE-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
; MINTREESIZE-NEXT: [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]]
@@ -142,19 +146,19 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
; MINTREESIZE-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
; MINTREESIZE-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
; MINTREESIZE-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
-; MINTREESIZE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[Q0]], i32 0
-; MINTREESIZE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[Q1]], i32 1
+; MINTREESIZE-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q0]], i32 0
+; MINTREESIZE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q1]], i32 1
; MINTREESIZE-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
; MINTREESIZE-NEXT: [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
-; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0
-; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[Q3]], i32 1
+; MINTREESIZE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0
+; MINTREESIZE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q3]], i32 1
; MINTREESIZE-NEXT: [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
; MINTREESIZE-NEXT: [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
-; MINTREESIZE-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
-; MINTREESIZE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q5]], i32 1
+; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
+; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1
; MINTREESIZE-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
-; MINTREESIZE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
-; MINTREESIZE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q5]], i32 1
+; MINTREESIZE-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
+; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[Q5]], i32 1
; MINTREESIZE-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
; MINTREESIZE-NEXT: call void @llvm.assume(i1 [[QI]])
; MINTREESIZE-NEXT: ret <4 x float> undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
index f6878c7b06f07..2b28765d82a04 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -167,6 +167,10 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
; MINTREESIZE-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
; MINTREESIZE-NEXT: [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
; MINTREESIZE-NEXT: [[CMP3:%.*]] = icmp ne i32 [[C3]], 0
+; MINTREESIZE-NEXT: [[TMP1:%.*]] = insertelement <4 x i1> poison, i1 [[CMP3]], i32 0
+; MINTREESIZE-NEXT: [[TMP2:%.*]] = insertelement <4 x i1> [[TMP1]], i1 [[CMP2]], i32 1
+; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> [[TMP2]], i1 [[CMP1]], i32 2
+; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[CMP0]], i32 3
; MINTREESIZE-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
; MINTREESIZE-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
; MINTREESIZE-NEXT: [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]]
@@ -177,19 +181,19 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
; MINTREESIZE-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
; MINTREESIZE-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
; MINTREESIZE-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
-; MINTREESIZE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[Q0]], i32 0
-; MINTREESIZE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[Q1]], i32 1
+; MINTREESIZE-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q0]], i32 0
+; MINTREESIZE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q1]], i32 1
; MINTREESIZE-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
; MINTREESIZE-NEXT: [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
-; MINTREESIZE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0
-; MINTREESIZE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[Q3]], i32 1
+; MINTREESIZE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0
+; MINTREESIZE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q3]], i32 1
; MINTREESIZE-NEXT: [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
; MINTREESIZE-NEXT: [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
-; MINTREESIZE-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
-; MINTREESIZE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q5]], i32 1
+; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
+; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1
; MINTREESIZE-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
-; MINTREESIZE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
-; MINTREESIZE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q5]], i32 1
+; MINTREESIZE-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
+; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[Q5]], i32 1
; MINTREESIZE-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
; MINTREESIZE-NEXT: call void @llvm.assume(i1 [[QI]])
; MINTREESIZE-NEXT: ret <4 x float> undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index b5e1dfe9d97c5..ee3481e242d04 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-- -S | FileCheck %s
-; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-- -mattr=avx512vl -S | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-- -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-- -mattr=avx512vl -S | FileCheck %s --check-prefixes=CHECK,AVX
declare void @use1(i1)
@@ -93,19 +93,36 @@ define i1 @logical_or_fcmp(<4 x float> %x) {
}
define i1 @logical_and_icmp_
diff _preds(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_
diff _preds(
-; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; CHECK-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0
-; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 0
-; CHECK-NEXT: [[C2:%.*]] = icmp sgt i32 [[X2]], 0
-; CHECK-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 0
-; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
-; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
-; CHECK-NEXT: ret i1 [[S3]]
+; SSE-LABEL: @logical_and_icmp_
diff _preds(
+; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; SSE-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; SSE-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0
+; SSE-NEXT: [[C2:%.*]] = icmp sgt i32 [[X2]], 0
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[X3]], i32 0
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[X1]], i32 1
+; SSE-NEXT: [[TMP3:%.*]] = icmp slt <2 x i32> [[TMP2]], zeroinitializer
+; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
+; SSE-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[TMP4]], i1 false
+; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
+; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
+; SSE-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP5]], i1 false
+; SSE-NEXT: ret i1 [[S3]]
+;
+; AVX-LABEL: @logical_and_icmp_
diff _preds(
+; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; AVX-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0
+; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 0
+; AVX-NEXT: [[C2:%.*]] = icmp sgt i32 [[X2]], 0
+; AVX-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 0
+; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
+; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
+; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
+; AVX-NEXT: ret i1 [[S3]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
@@ -144,17 +161,14 @@ define i1 @logical_and_icmp_
diff _const(<4 x i32> %x) {
define i1 @mixed_logical_icmp(<4 x i32> %x) {
; CHECK-LABEL: @mixed_logical_icmp(
-; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; CHECK-NEXT: [[C0:%.*]] = icmp sgt i32 [[X0]], 0
-; CHECK-NEXT: [[C1:%.*]] = icmp sgt i32 [[X1]], 0
-; CHECK-NEXT: [[C2:%.*]] = icmp sgt i32 [[X2]], 0
-; CHECK-NEXT: [[C3:%.*]] = icmp sgt i32 [[X3]], 0
-; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
-; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 true, i1 [[C2]]
-; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[X:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; CHECK-NEXT: [[S1:%.*]] = select i1 [[TMP2]], i1 [[TMP3]], i1 false
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 true, i1 [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP5]], i1 false
; CHECK-NEXT: ret i1 [[S3]]
;
%x0 = extractelement <4 x i32> %x, i32 0
@@ -200,21 +214,18 @@ define i1 @logical_and_icmp_subvec(<4 x i32> %x) {
define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
; CHECK-LABEL: @logical_and_icmp_clamp(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[X]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[TMP4]], 17
-; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[TMP3]], 17
-; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[TMP2]], 17
-; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[TMP1]], 17
-; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
-; CHECK-NEXT: [[S4:%.*]] = select i1 [[TMP7]], i1 [[D0]], i1 false
-; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
-; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
-; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
+; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
+; CHECK-NEXT: [[S4:%.*]] = select i1 [[TMP4]], i1 [[TMP5]], i1 false
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
+; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[TMP6]], i1 false
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
+; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[TMP7]], i1 false
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
+; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[TMP8]], i1 false
; CHECK-NEXT: ret i1 [[S7]]
;
%x0 = extractelement <4 x i32> %x, i32 0
More information about the llvm-commits
mailing list