[llvm] [SLP] Initial non-power-of-2 (but still whole-register) support for remaining nodes (PR #113356)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 24 11:48:46 PST 2024
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/113356
From 72d910e364c49bc95f40ebd2ef38d2eef65b689d Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Tue, 22 Oct 2024 18:26:10 +0000
Subject: [PATCH] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 53 +++++++-----
.../SLPVectorizer/X86/long-full-reg-stores.ll | 30 +++++--
...duced-val-extracted-and-externally-used.ll | 18 ++--
.../extract-many-users-buildvector.ll | 85 +++++++++++++------
4 files changed, 125 insertions(+), 61 deletions(-)
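For reference, the core idea of the patch is that nodes which previously required a power-of-two number of scalars may now also use any element count that fills a whole number of vector registers. Below is a minimal standalone C++ model of that predicate; it is an illustrative sketch with an assumed fixed register width, not the actual hasFullVectorsOrPowerOf2 implementation, which queries TTI for how the widened type legalizes.

#include <cassert>

static bool isPow2(unsigned X) { return X && (X & (X - 1)) == 0; }

// Sketch of the "full vectors or power of 2" acceptance test for a
// vectorization factor VF, with RegBits/EltBits standing in for what TTI
// and the scalar type provide in the real code.
static bool fullVectorsOrPowerOf2(unsigned RegBits, unsigned EltBits,
                                  unsigned VF) {
  if (isPow2(VF))
    return true; // classic SLP: power-of-two factors are always candidates
  unsigned PerReg = RegBits / EltBits; // elements in one full register
  // A non-power-of-2 VF is acceptable when it splits into several whole
  // registers, each holding a power-of-two number of elements.
  return PerReg > 0 && isPow2(PerReg) && VF > PerReg && VF % PerReg == 0;
}

int main() {
  // <6 x double> on 128-bit registers is 3 full registers, so VF == 6 is
  // accepted even though 6 is not a power of two; this is the shape the
  // long-full-reg-stores.ll test below exercises.
  assert(fullVectorsOrPowerOf2(128, 64, 6));
  // 7 doubles would leave a register partially filled: still rejected.
  assert(!fullVectorsOrPowerOf2(128, 64, 7));
  return 0;
}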
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 756b25ac985612..7920c3ef3c9ea8 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2547,7 +2547,9 @@ class BoUpSLP {
}
// TODO: Check if we can remove a check for non-power-2 number of
// scalars after full support of non-power-2 vectorization.
- return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
+ return UniqueValues.size() != 2 &&
+ hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
+ UniqueValues.size());
};
// If the initial strategy fails for any of the operand indexes, then we
@@ -4945,12 +4947,13 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
});
});
const unsigned AbsoluteDiff = std::abs(*Diff);
- if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
- ((Sz > MinProfitableStridedLoads ||
- (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
- has_single_bit(AbsoluteDiff))) &&
- AbsoluteDiff > Sz) ||
- *Diff == -(static_cast<int>(Sz) - 1))) {
+ if (IsPossibleStrided &&
+ (IsAnyPointerUsedOutGraph ||
+ ((Sz > MinProfitableStridedLoads ||
+ (AbsoluteDiff <= MaxProfitableLoadStride * Sz && AbsoluteDiff > Sz &&
+ AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz))) &&
+ AbsoluteDiff > Sz) ||
+ *Diff == -(static_cast<int>(Sz) - 1))) {
int Stride = *Diff / static_cast<int>(Sz - 1);
if (*Diff == Stride * static_cast<int>(Sz - 1)) {
Align Alignment =
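As an aside on the hunk above: the old guard required the total pointer distance AbsoluteDiff itself to be a power of two, which can never hold for a whole-register group like Sz == 6. The new guard instead requires AbsoluteDiff to be a whole multiple of Sz with a power-of-two quotient. A small standalone model of just that condition follows (MaxProfitableLoadStride is a tuning constant in the real code; 8 below is only an illustrative value):

#include <cstdio>

static bool isPow2(unsigned X) { return X && (X & (X - 1)) == 0; }

// Models only the relaxed profitability test on the distance between the
// first and last pointer of an Sz-element candidate strided load.
static bool profitableStridedDiff(unsigned Sz, unsigned AbsoluteDiff,
                                  unsigned MaxProfitableLoadStride = 8) {
  return AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
         AbsoluteDiff > Sz && AbsoluteDiff % Sz == 0 &&
         isPow2(AbsoluteDiff / Sz);
}

int main() {
  std::printf("%d\n", profitableStridedDiff(6, 12)); // 1: 12 = 6 * 2
  std::printf("%d\n", profitableStridedDiff(6, 16)); // 0: 16 is not a multiple of 6
  std::printf("%d\n", profitableStridedDiff(6, 18)); // 0: 18 / 6 = 3 is not a power of 2
  return 0;
}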
@@ -5036,9 +5039,10 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
// FIXME: The following code has not been updated for non-power-of-2
- // vectors. The splitting logic here does not cover the original
- // vector if the vector factor is not a power of two. FIXME
- if (!has_single_bit(VL.size()))
+ // vectors (and not whole registers). The splitting logic here does not
+ // cover the original vector if the vector factor is not a power of two.
+ // FIXME
+ if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
return false;
unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
@@ -5046,7 +5050,10 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
DemandedElts.clearAllBits();
// Iterate through possible vectorization factors and check if vectorized +
// shuffles is better than just gather.
- for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
+ for (unsigned VF =
+ getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
+ VF >= MinVF;
+ VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
SmallVector<LoadsState> States;
for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
@@ -7424,8 +7431,9 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
- // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
- if (!has_single_bit(VL.size()))
+ // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
+ // non-full registers).
+ if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
return TreeEntry::NeedToGather;
if (Reuse || !CurrentOrder.empty())
return TreeEntry::Vectorize;
@@ -7864,7 +7872,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
if ((UserTreeIdx.UserTE &&
UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
- !has_single_bit(VL.size())) {
+ !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
"for nodes with padding.\n");
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
@@ -9425,7 +9433,8 @@ void BoUpSLP::transformNodes() {
if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice) ||
(S.getOpcode() == Instruction::Load &&
areKnownNonVectorizableLoads(Slice)) ||
- (S.getOpcode() != Instruction::Load && !has_single_bit(VF)))
+ (S.getOpcode() != Instruction::Load &&
+ !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
continue;
if (VF == 2) {
// Try to vectorize reduced values or if all users are vectorized.
@@ -12899,8 +12908,9 @@ BoUpSLP::isGatherShuffledEntry(
return !TE->isGather();
})))
return {};
- // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
- if (TE->isNonPowOf2Vec())
+ // FIXME: Gathering for non-power-of-2 (non-whole-register) nodes not
+ // implemented yet.
+ if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
return {};
Mask.assign(VL.size(), PoisonMaskElem);
assert((TE->UserTreeIndices.size() == 1 ||
@@ -18312,7 +18322,9 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = R.getMinVF(Sz);
- unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
+ unsigned MaxVF = std::max<unsigned>(
+ getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VL.size()),
+ MinVF);
MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
if (MaxVF < 2) {
R.getORE()->emit([&]() {
@@ -18329,7 +18341,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
Type *ScalarTy = getValueType(VL[0]);
unsigned NextInst = 0, MaxInst = VL.size();
- for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
+ for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
+ VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
// No actual vectorization should happen, if number of parts is the same as
// provided vectorization factor (i.e. the scalar type is used for vector
// code during codegen).
@@ -18344,7 +18357,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (MaxVFOnly && ActualVF < MaxVF)
break;
- if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
+ if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
break;
SmallVector<Value *> Ops(ActualVF, nullptr);
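To round out the SLPVectorizer.cpp changes: both VF loops touched above (the splitting loop in canVectorizeLoads and the one in tryToVectorizeList) now step the vectorization factor down to the next smaller whole-register element count instead of halving it. A standalone sketch of the resulting descent, using the same simplified 128-bit-register/64-bit-element model as earlier (floorFullVectorElts is a hypothetical stand-in for getFloorFullVectorNumberOfElements):

#include <cstdio>

static unsigned bitFloor(unsigned X) {
  unsigned R = 1;
  while (R * 2 <= X)
    R *= 2;
  return R;
}

// Largest element count <= VF that fills whole registers, assuming 2
// elements per register (e.g. doubles in 128-bit registers).
static unsigned floorFullVectorElts(unsigned VF) {
  const unsigned PerReg = 2;
  if (VF >= 2 * PerReg)
    return VF - VF % PerReg; // round down to a whole number of registers
  return bitFloor(VF);       // below two registers: plain power of two
}

int main() {
  const unsigned MinVF = 2;
  // The old descent visited 16, 8, 4, 2; the new one also tries the
  // in-between whole-register factors: 16, 14, 12, 10, 8, 6, 4, 2.
  for (unsigned VF = 16; VF >= MinVF; VF = floorFullVectorElts(VF - 1))
    std::printf("VF=%u\n", VF);
  return 0;
}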
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll b/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
index aff66dd7c10ea7..3ae36121fb9eb1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
@@ -1,19 +1,35 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+; FIXME: need to fix/improve the cost estimation of shuffles spanning many SK_PermuteTwo vector registers,
+; accounting correctly for poisoned/identity shuffles and costing only the actual shuffles.
define void @test(ptr noalias %0, ptr noalias %1) {
; CHECK-LABEL: define void @test(
; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr noalias [[TMP1:%.*]]) {
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 24
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP1]], i64 48
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP1]], i64 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP1]], i64 16
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 24
+; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP7]], align 8
+; CHECK-NEXT: store double [[TMP8]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP0]], i64 48
+; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr [[TMP9]], align 16
+; CHECK-NEXT: store double [[TMP10]], ptr [[TMP6]], align 16
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP9]], align 16
-; CHECK-NEXT: [[TMP7:%.*]] = load <4 x double>, ptr [[TMP11]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP10]], <6 x i32> <i32 2, i32 4, i32 0, i32 3, i32 5, i32 1>
-; CHECK-NEXT: store <6 x double> [[TMP13]], ptr [[TMP5]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr [[TMP11]], align 8
+; CHECK-NEXT: store double [[TMP12]], ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP0]], i64 32
+; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[TMP13]], align 16
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP1]], i64 32
+; CHECK-NEXT: store double [[TMP14]], ptr [[TMP15]], align 16
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP0]], i64 56
+; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[TMP16]], align 8
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP1]], i64 40
+; CHECK-NEXT: store double [[TMP17]], ptr [[TMP18]], align 8
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP0]], i64 16
+; CHECK-NEXT: [[TMP20:%.*]] = load double, ptr [[TMP19]], align 16
+; CHECK-NEXT: store double [[TMP20]], ptr [[TMP4]], align 16
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40
; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr [[TMP21]], align 8
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP1]], i64 56
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll
index bb7964146c44d2..d1617c9a382d16 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll
@@ -8,23 +8,23 @@ define void @test(i32 %arg) {
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[ARG]], i32 0
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[BB1]]:
-; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP5:%.*]], %[[BB1]] ]
-; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP6:%.*]], %[[BB1]] ]
+; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP5:%.*]], %[[BB1]] ]
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP6:%.*]], %[[BB1]] ]
; CHECK-NEXT: [[PHI3:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[OP_RDX4:%.*]], %[[BB1]] ]
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB1]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0>
; CHECK-NEXT: [[ADD17:%.*]] = add i32 [[PHI]], 0
-; CHECK-NEXT: [[ADD18:%.*]] = add i32 [[PHI2]], 0
+; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[PHI]], 0
; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[PHI2]], 0
-; CHECK-NEXT: [[ADD23:%.*]] = add i32 [[PHI2]], 0
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[PHI]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT: [[TMP4]] = add <2 x i32> [[TMP0]], <i32 0, i32 1>
; CHECK-NEXT: [[TMP5]] = extractelement <2 x i32> [[TMP4]], i32 1
; CHECK-NEXT: [[TMP6]] = extractelement <2 x i32> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> [[TMP3]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = xor i32 [[TMP7]], [[ADD18]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = xor i32 [[ADD17]], [[ADD19]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = xor i32 [[ADD23]], [[TMP6]]
+; CHECK-NEXT: [[OP_RDX:%.*]] = xor i32 [[TMP7]], [[ADD17]]
+; CHECK-NEXT: [[OP_RDX1:%.*]] = xor i32 [[ADD4]], [[ADD6]]
+; CHECK-NEXT: [[OP_RDX2:%.*]] = xor i32 [[ADD19]], [[TMP6]]
; CHECK-NEXT: [[OP_RDX3:%.*]] = xor i32 [[OP_RDX]], [[OP_RDX1]]
; CHECK-NEXT: [[OP_RDX4]] = xor i32 [[OP_RDX3]], [[OP_RDX2]]
; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i32 [[TMP5]], 0
diff --git a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
index 261ec2b3935d7e..fc9a45adb2ed7f 100644
--- a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
@@ -1,31 +1,66 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix X86 %}
+; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix AARCH64 %}
+; FIXME: need to fix/improve the cost estimation of shuffles spanning many SK_PermuteTwo vector registers,
+; accounting correctly for poisoned/identity shuffles and costing only the actual shuffles.
define i1 @test(float %0, double %1) {
-; CHECK-LABEL: define i1 @test
-; CHECK-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 poison, i32 0, i32 3, i32 3>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 2, i32 0, i32 1, i32 7>
-; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
-; CHECK-NEXT: [[TMP12:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP10]], i64 0)
-; CHECK-NEXT: [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP11]], i64 0)
-; CHECK-NEXT: [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP13]], <2 x double> [[TMP6]], i64 4)
-; CHECK-NEXT: [[TMP15:%.*]] = fsub <8 x double> [[TMP12]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = fmul <8 x double> [[TMP12]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP18:%.*]] = fptrunc <8 x double> [[TMP17]] to <8 x float>
-; CHECK-NEXT: [[TMP19:%.*]] = fmul <8 x float> [[TMP18]], zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <8 x float> [[TMP19]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = freeze <8 x i1> [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP21]])
-; CHECK-NEXT: ret i1 [[TMP22]]
+; X86-LABEL: define i1 @test
+; X86-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
+; X86-NEXT: [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
+; X86-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
+; X86-NEXT: [[TMP5:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[TMP1]], i32 0
+; X86-NEXT: [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
+; X86-NEXT: [[TMP7:%.*]] = insertelement <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, double [[TMP1]], i32 1
+; X86-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; X86-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; X86-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 5, i32 poison>
+; X86-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; X86-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 2, i32 0, i32 1, i32 7>
+; X86-NEXT: [[TMP13:%.*]] = fmul <4 x double> [[TMP11]], [[TMP12]]
+; X86-NEXT: [[TMP14:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
+; X86-NEXT: [[TMP15:%.*]] = fsub <4 x double> [[TMP13]], [[TMP14]]
+; X86-NEXT: [[TMP16:%.*]] = fptrunc <4 x double> [[TMP15]] to <4 x float>
+; X86-NEXT: [[TMP17:%.*]] = fmul <4 x float> [[TMP16]], zeroinitializer
+; X86-NEXT: [[TMP18:%.*]] = fcmp oeq <4 x float> [[TMP17]], zeroinitializer
+; X86-NEXT: [[TMP19:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> <double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <2 x double> [[TMP6]], i64 0)
+; X86-NEXT: [[TMP20:%.*]] = fsub <4 x double> zeroinitializer, [[TMP19]]
+; X86-NEXT: [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP19]]
+; X86-NEXT: [[TMP22:%.*]] = shufflevector <4 x double> [[TMP20]], <4 x double> [[TMP21]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; X86-NEXT: [[TMP23:%.*]] = fptrunc <4 x double> [[TMP22]] to <4 x float>
+; X86-NEXT: [[TMP24:%.*]] = fmul <4 x float> [[TMP23]], zeroinitializer
+; X86-NEXT: [[TMP25:%.*]] = fcmp oeq <4 x float> [[TMP24]], zeroinitializer
+; X86-NEXT: [[TMP26:%.*]] = freeze <4 x i1> [[TMP18]]
+; X86-NEXT: [[TMP27:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP26]])
+; X86-NEXT: [[TMP28:%.*]] = freeze <4 x i1> [[TMP25]]
+; X86-NEXT: [[TMP29:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP28]])
+; X86-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP27]], i1 [[TMP29]], i1 false
+; X86-NEXT: ret i1 [[OP_RDX]]
+;
+; AARCH64-LABEL: define i1 @test
+; AARCH64-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
+; AARCH64-NEXT: [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
+; AARCH64-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
+; AARCH64-NEXT: [[TMP5:%.*]] = insertelement <6 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00>, double [[TMP1]], i32 4
+; AARCH64-NEXT: [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; AARCH64-NEXT: [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; AARCH64-NEXT: [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]]
+; AARCH64-NEXT: [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32> <i32 poison, i32 4, i32 11, i32 11>
+; AARCH64-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; AARCH64-NEXT: [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 poison>
+; AARCH64-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; AARCH64-NEXT: [[TMP13:%.*]] = fmul <4 x double> [[TMP10]], [[TMP12]]
+; AARCH64-NEXT: [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP13]], i64 0)
+; AARCH64-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v6f64(<8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <6 x double> [[TMP8]], i64 0)
+; AARCH64-NEXT: [[TMP16:%.*]] = fsub <8 x double> [[TMP14]], [[TMP15]]
+; AARCH64-NEXT: [[TMP17:%.*]] = fmul <8 x double> [[TMP14]], [[TMP15]]
+; AARCH64-NEXT: [[TMP18:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> [[TMP17]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; AARCH64-NEXT: [[TMP19:%.*]] = fptrunc <8 x double> [[TMP18]] to <8 x float>
+; AARCH64-NEXT: [[TMP20:%.*]] = fmul <8 x float> [[TMP19]], zeroinitializer
+; AARCH64-NEXT: [[TMP21:%.*]] = fcmp oeq <8 x float> [[TMP20]], zeroinitializer
+; AARCH64-NEXT: [[TMP22:%.*]] = freeze <8 x i1> [[TMP21]]
+; AARCH64-NEXT: [[TMP23:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP22]])
+; AARCH64-NEXT: ret i1 [[TMP23]]
;
%3 = fpext float %0 to double
%4 = fpext float 0.000000e+00 to double