[llvm] 279b1ea - [SLP]Improve gathering of the scalars used in the graph.
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 1 11:26:31 PST 2023
Author: Alexey Bataev
Date: 2023-12-01T11:23:57-08:00
New Revision: 279b1ea65f8403aa6d49e7aafa7e40dc906be4bf
URL: https://github.com/llvm/llvm-project/commit/279b1ea65f8403aa6d49e7aafa7e40dc906be4bf
DIFF: https://github.com/llvm/llvm-project/commit/279b1ea65f8403aa6d49e7aafa7e40dc906be4bf.diff
LOG: [SLP]Improve gathering of the scalars used in the graph.
Currently, gathers of scalars that are already vectorized elsewhere in the
tree are emitted as chains of extractelement/insertelement pairs. Instead,
we can find all the required source vectors and emit shufflevector
instructions directly (see the IR sketch below), improving the generated
code and reducing compile time.
Part of non-power-of-2 vectorization.
Differential Revision: https://reviews.llvm.org/D110978
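For illustration, here is a minimal hand-written IR sketch of the change the
log describes. The values %x and %y and the lane choices are hypothetical
(they mirror the example in the doc comment removed below), not output
produced by this patch.

Before, such a gather was scalarized into extract/insert pairs:

  %x0 = extractelement <4 x i8> %x, i32 0
  %x3 = extractelement <4 x i8> %x, i32 3
  %y1 = extractelement <4 x i8> %y, i32 1
  %y2 = extractelement <4 x i8> %y, i32 2
  %v1 = insertelement <4 x i8> poison, i8 %x0, i32 0
  %v2 = insertelement <4 x i8> %v1, i8 %x3, i32 1
  %v3 = insertelement <4 x i8> %v2, i8 %y1, i32 2
  %v4 = insertelement <4 x i8> %v3, i8 %y2, i32 3

After, the same gather is expressed as a single two-source shuffle:

  %v = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5, i32 6>

The doc comment removed in this patch notes that InstCombine would previously
turn such chains into a shuffle; the point of the change is to emit the
shuffle form directly in SLP rather than rely on later cleanup.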
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a95754f74d1b6ef..be48465b8e0e4f5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -436,26 +436,6 @@ static SmallBitVector isUndefVector(const Value *V,
/// i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
-/// We convert this initially to something like:
-/// %x0 = extractelement <4 x i8> %x, i32 0
-/// %x3 = extractelement <4 x i8> %x, i32 3
-/// %y1 = extractelement <4 x i8> %y, i32 1
-/// %y2 = extractelement <4 x i8> %y, i32 2
-/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
-/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
-/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
-/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
-/// %5 = mul <4 x i8> %4, %4
-/// %6 = extractelement <4 x i8> %5, i32 0
-/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
-/// %7 = extractelement <4 x i8> %5, i32 1
-/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
-/// %8 = extractelement <4 x i8> %5, i32 2
-/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
-/// %9 = extractelement <4 x i8> %5, i32 3
-/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
-/// ret <4 x i8> %ins4
-/// InstCombiner transforms this into a shuffle and vector mul
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
@@ -7505,6 +7485,14 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
return VecBase;
}
+ /// Checks if the specified entry \p E needs to be delayed because of its
+ /// dependency nodes.
+ std::optional<InstructionCost>
+ needToDelay(const TreeEntry *,
+ ArrayRef<SmallVector<const TreeEntry *>>) const {
+ // No need to delay the cost estimation during analysis.
+ return std::nullopt;
+ }
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
if (&E1 == &E2) {
assert(all_of(Mask,
@@ -7619,13 +7607,16 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
CommonMask[Idx] = Mask[Idx] + VF;
}
- Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
+ Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
+ Value *Root = nullptr) {
Cost += getBuildVectorCost(VL, Root);
if (!Root) {
- assert(InVectors.empty() && "Unexpected input vectors for buildvector.");
// FIXME: Need to find a way to avoid use of getNullValue here.
SmallVector<Constant *> Vals;
- for (Value *V : VL) {
+ unsigned VF = VL.size();
+ if (MaskVF != 0)
+ VF = std::min(VF, MaskVF);
+ for (Value *V : VL.take_front(VF)) {
if (isa<UndefValue>(V)) {
Vals.push_back(cast<Constant>(V));
continue;
@@ -7635,9 +7626,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
return ConstantVector::get(Vals);
}
return ConstantVector::getSplat(
- ElementCount::getFixed(VL.size()),
+ ElementCount::getFixed(
+ cast<FixedVectorType>(Root->getType())->getNumElements()),
getAllOnesValue(*R.DL, VL.front()->getType()));
}
+ InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
/// Finalize emission of the shuffles.
InstructionCost
finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
@@ -7659,8 +7652,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
InVectors.front() = V;
}
::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
- if (CommonMask.empty())
+ if (CommonMask.empty()) {
+ assert(InVectors.size() == 1 && "Expected only one vector with no mask");
return Cost;
+ }
return Cost +
createShuffle(InVectors.front(),
InVectors.size() == 2 ? InVectors.back() : nullptr,
@@ -7737,189 +7732,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
return 0;
if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();
- ShuffleCostEstimator Estimator(*TTI, VectorizedVals, *this,
- CheckedExtracts);
- unsigned VF = E->getVectorFactor();
- SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
- E->ReuseShuffleIndices.end());
- SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
- // Build a mask out of the reorder indices and reorder scalars per this
- // mask.
- SmallVector<int> ReorderMask;
- inversePermutation(E->ReorderIndices, ReorderMask);
- if (!ReorderMask.empty())
- reorderScalars(GatheredScalars, ReorderMask);
- SmallVector<int> Mask;
- SmallVector<int> ExtractMask;
- Value *ExtractVecBase = nullptr;
- bool UseVecBaseAsInput = false;
- SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
- SmallVector<SmallVector<const TreeEntry *>> Entries;
- SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
- // Check for gathered extracts.
- bool Resized = false;
- unsigned NumParts = TTI->getNumberOfParts(VecTy);
- if (NumParts == 0 || NumParts >= GatheredScalars.size())
- NumParts = 1;
- if (!all_of(GatheredScalars, UndefValue::classof)) {
- ExtractShuffles =
- tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
- if (!ExtractShuffles.empty()) {
- if (Value *VecBase = Estimator.adjustExtracts(
- E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
- if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
- if (VF == VecBaseTy->getNumElements() &&
- GatheredScalars.size() != VF) {
- Resized = true;
- GatheredScalars.append(VF - GatheredScalars.size(),
- PoisonValue::get(ScalarTy));
- }
- }
- }
-
- // Do not try to look for reshuffled loads for gathered loads (they will
- // be handled later), for vectorized scalars, and cases, which are
- // definitely not profitable (splats and small gather nodes.)
- if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
- E->isAltShuffle() ||
- all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
- isSplat(E->Scalars) ||
- (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
- GatherShuffles =
- isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
- }
- if (!GatherShuffles.empty()) {
- if (GatherShuffles.size() == 1 &&
- *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
- Entries.front().front()->isSame(E->Scalars)) {
- // Perfect match in the graph, will reuse the previously vectorized
- // node. Cost is 0.
- LLVM_DEBUG(
- dbgs()
- << "SLP: perfect diamond match for gather bundle "
- << shortBundleName(VL) << ".\n");
- // Restore the mask for previous partially matched values.
- Mask.resize(E->Scalars.size());
- const TreeEntry *FrontTE = Entries.front().front();
- if (FrontTE->ReorderIndices.empty() &&
- ((FrontTE->ReuseShuffleIndices.empty() &&
- E->Scalars.size() == FrontTE->Scalars.size()) ||
- (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
- std::iota(Mask.begin(), Mask.end(), 0);
- } else {
- for (auto [I, V] : enumerate(E->Scalars)) {
- if (isa<PoisonValue>(V)) {
- Mask[I] = PoisonMaskElem;
- continue;
- }
- Mask[I] = FrontTE->findLaneForValue(V);
- }
- }
- Estimator.add(*FrontTE, Mask);
- return Estimator.finalize(E->getCommonMask());
- }
- if (!Resized) {
- if (GatheredScalars.size() != VF &&
- any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
- return any_of(TEs, [&](const TreeEntry *TE) {
- return TE->getVectorFactor() == VF;
- });
- }))
- GatheredScalars.append(VF - GatheredScalars.size(),
- PoisonValue::get(ScalarTy));
- }
- // Remove shuffled elements from list of gathers.
- for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
- if (Mask[I] != PoisonMaskElem)
- GatheredScalars[I] = PoisonValue::get(ScalarTy);
- }
- LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
- << " entries for bundle "
- << shortBundleName(VL) << ".\n");
- unsigned SliceSize = E->Scalars.size() / NumParts;
- SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
- for (const auto [I, TEs] : enumerate(Entries)) {
- if (TEs.empty()) {
- assert(!GatherShuffles[I] &&
- "No shuffles with empty entries list expected.");
- continue;
- }
- assert((TEs.size() == 1 || TEs.size() == 2) &&
- "Expected shuffle of 1 or 2 entries.");
- auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
- VecMask.assign(VecMask.size(), PoisonMaskElem);
- copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
- Estimator.add(*TEs.front(), *TEs.back(), VecMask);
- }
- if (all_of(GatheredScalars, PoisonValue ::classof))
- return Estimator.finalize(E->ReuseShuffleIndices);
- return Estimator.finalize(
- E->ReuseShuffleIndices, E->Scalars.size(),
- [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
- Vec = Estimator.gather(GatheredScalars,
- Constant::getNullValue(FixedVectorType::get(
- ScalarTy, GatheredScalars.size())));
- });
- }
- if (!ExtractShuffles.empty()) {
- Value *Vec1 = nullptr;
- // Gather of extractelements can be represented as just a shuffle of
- // a single/two vectors the scalars are extracted from.
- // Find input vectors.
- Value *Vec2 = nullptr;
- for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
- if (!Mask.empty() && Mask[I] != PoisonMaskElem)
- ExtractMask[I] = PoisonMaskElem;
- }
- if (UseVecBaseAsInput) {
- Vec1 = ExtractVecBase;
- } else {
- for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
- if (ExtractMask[I] == PoisonMaskElem)
- continue;
- if (isa<UndefValue>(E->Scalars[I]))
- continue;
- auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
- Value *VecOp = EI->getVectorOperand();
- if (const auto *TE = getTreeEntry(VecOp))
- if (TE->VectorizedValue)
- VecOp = TE->VectorizedValue;
- if (!Vec1) {
- Vec1 = VecOp;
- } else if (Vec1 != EI->getVectorOperand()) {
- assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
- "Expected only 1 or 2 vectors shuffle.");
- Vec2 = VecOp;
- }
- }
- }
- if (Vec2) {
- Estimator.add(Vec1, Vec2, ExtractMask);
- } else if (Vec1) {
- Estimator.add(Vec1, ExtractMask, /*ForExtracts=*/true);
- } else {
- Estimator.add(PoisonValue::get(FixedVectorType::get(
- ScalarTy, GatheredScalars.size())),
- ExtractMask, /*ForExtracts=*/true);
- }
- }
- if (!all_of(GatheredScalars, PoisonValue::classof)) {
- auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size());
- bool SameGathers = VL.equals(Gathers);
- if (!SameGathers)
- return Estimator.finalize(
- E->ReuseShuffleIndices, E->Scalars.size(),
- [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
- Vec = Estimator.gather(
- GatheredScalars, Constant::getNullValue(FixedVectorType::get(
- ScalarTy, GatheredScalars.size())));
- });
- Value *BV = Estimator.gather(Gathers);
- SmallVector<int> ReuseMask(Gathers.size(), PoisonMaskElem);
- std::iota(ReuseMask.begin(), ReuseMask.end(), 0);
- Estimator.add(BV, ReuseMask);
- }
- return Estimator.finalize(E->ReuseShuffleIndices);
+ return processBuildVector<ShuffleCostEstimator, InstructionCost>(
+ E, *TTI, VectorizedVals, *this, CheckedExtracts);
}
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
@@ -10337,6 +10151,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
/// Adjusts extractelements after reusing them.
Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
+ ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
unsigned NumParts, bool &UseVecBaseAsInput) {
UseVecBaseAsInput = false;
SmallPtrSet<Value *, 4> UniqueBases;
@@ -10441,14 +10256,15 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
}
/// Checks if the specified entry \p E needs to be delayed because of its
/// dependency nodes.
- Value *needToDelay(const TreeEntry *E,
- ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
+ std::optional<Value *>
+ needToDelay(const TreeEntry *E,
+ ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
// No need to delay emission if all deps are ready.
if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
return all_of(
TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
}))
- return nullptr;
+ return std::nullopt;
// Postpone gather emission, will be emitted after the end of the
// process to keep correct order.
auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
@@ -10558,7 +10374,8 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
inversePermutation(Order, NewMask);
add(V1, NewMask);
}
- Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
+ Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
+ Value *Root = nullptr) {
return R.gather(VL, Root);
}
Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
@@ -10819,15 +10636,16 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
ExtractEntries.push_back(TE);
}
- if (Value *Delayed = ShuffleBuilder.needToDelay(E, ExtractEntries)) {
+ if (std::optional<ResTy> Delayed =
+ ShuffleBuilder.needToDelay(E, ExtractEntries)) {
// Delay emission of gathers which are not ready yet.
PostponedGathers.insert(E);
// Postpone gather emission, will be emitted after the end of the
// process to keep correct order.
- return Delayed;
+ return *Delayed;
}
if (Value *VecBase = ShuffleBuilder.adjustExtracts(
- E, ExtractMask, NumParts, UseVecBaseAsInput)) {
+ E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
ExtractVecBase = VecBase;
if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
if (VF == VecBaseTy->getNumElements() &&
@@ -10848,12 +10666,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
}
if (!GatherShuffles.empty()) {
- if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) {
+ if (std::optional<ResTy> Delayed =
+ ShuffleBuilder.needToDelay(E, Entries)) {
// Delay emission of gathers which are not ready yet.
PostponedGathers.insert(E);
// Postpone gather emission, will be emitted after the end of the
// process to keep correct order.
- return Delayed;
+ return *Delayed;
}
if (GatherShuffles.size() == 1 &&
*GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
@@ -11062,14 +10881,16 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
IsUsedInExpr &=
FindReusedSplat(VecMask, TEs.front()->getVectorFactor());
ShuffleBuilder.add(*TEs.front(), VecMask);
- IsNonPoisoned &=
- isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
+ if (TEs.front()->VectorizedValue)
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
} else {
IsUsedInExpr = false;
ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
- IsNonPoisoned &=
- isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
- isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
+ if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
+ IsNonPoisoned &=
+ isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
+ isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
}
}
}
@@ -11128,7 +10949,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
if (!all_of(GatheredScalars, PoisonValue::classof)) {
SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
- Value *BV = ShuffleBuilder.gather(GatheredScalars);
+ Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
ShuffleBuilder.add(BV, BVMask);
}
if (all_of(NonConstants, [=](Value *V) {
@@ -11142,13 +10963,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
E->ReuseShuffleIndices, E->Scalars.size(),
[&](Value *&Vec, SmallVectorImpl<int> &Mask) {
TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
- Vec = ShuffleBuilder.gather(NonConstants, Vec);
+ Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
});
} else if (!allConstant(GatheredScalars)) {
// Gather unique scalars and all constants.
SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
- Value *BV = ShuffleBuilder.gather(GatheredScalars);
+ Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
ShuffleBuilder.add(BV, ReuseMask);
Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
} else {
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
index d7425f053d5014e..c6209fd71063a03 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
@@ -6,58 +6,36 @@
define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) {
; CHECK-LABEL: @zot(
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[VAL:%.*]] = fmul fast float 0.000000e+00, 0.000000e+00
-; CHECK-NEXT: [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]]
-; CHECK-NEXT: [[VAL10:%.*]] = fmul fast float [[ARG3:%.*]], 1.000000e+00
-; CHECK-NEXT: [[VAL11:%.*]] = fmul fast float [[ARG3]], 1.000000e+00
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> <float 0.000000e+00, float poison, float poison, float poison>, float [[ARG:%.*]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
-; CHECK-NEXT: [[VAL13:%.*]] = fadd fast float [[VAL12]], 2.000000e+00
-; CHECK-NEXT: [[VAL14:%.*]] = fadd fast float 0.000000e+00, 0.000000e+00
-; CHECK-NEXT: [[VAL15:%.*]] = fadd fast float [[VAL14]], 1.000000e+00
-; CHECK-NEXT: [[VAL16:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
-; CHECK-NEXT: [[VAL17:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[VAL12]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[TMP5]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]]
; CHECK: bb18:
-; CHECK-NEXT: [[VAL19:%.*]] = phi float [ [[VAL13]], [[BB:%.*]] ]
-; CHECK-NEXT: [[VAL20:%.*]] = phi float [ [[VAL15]], [[BB]] ]
-; CHECK-NEXT: [[VAL21:%.*]] = phi float [ [[VAL16]], [[BB]] ]
-; CHECK-NEXT: [[VAL22:%.*]] = phi float [ [[VAL17]], [[BB]] ]
-; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[VAL16]], 2.000000e+00
-; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[VAL17]], 3.000000e+00
+; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP6]], [[BB:%.*]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
+; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[TMP8]], 2.000000e+00
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i32 3
+; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[TMP9]], 3.000000e+00
; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]]
; CHECK: bb25:
-; CHECK-NEXT: [[VAL26:%.*]] = phi float [ [[VAL19]], [[BB18]] ]
-; CHECK-NEXT: [[VAL27:%.*]] = phi float [ [[VAL20]], [[BB18]] ]
-; CHECK-NEXT: [[VAL28:%.*]] = phi float [ [[VAL21]], [[BB18]] ]
-; CHECK-NEXT: [[VAL29:%.*]] = phi float [ [[VAL22]], [[BB18]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x float> [ [[TMP7]], [[BB18]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
; CHECK-NEXT: br label [[BB30:%.*]]
; CHECK: bb30:
; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
-; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
-; CHECK-NEXT: [[VAL33:%.*]] = load i8, ptr [[ARG5:%.*]], align 1
-; CHECK-NEXT: [[VAL34:%.*]] = uitofp i8 [[VAL33]] to float
-; CHECK-NEXT: [[VAL35:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 1
-; CHECK-NEXT: [[VAL36:%.*]] = load i8, ptr [[VAL35]], align 1
-; CHECK-NEXT: [[VAL37:%.*]] = uitofp i8 [[VAL36]] to float
-; CHECK-NEXT: [[VAL38:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 2
-; CHECK-NEXT: [[VAL39:%.*]] = load i8, ptr [[VAL38]], align 1
-; CHECK-NEXT: [[VAL40:%.*]] = uitofp i8 [[VAL39]] to float
-; CHECK-NEXT: [[VAL41:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 3
-; CHECK-NEXT: [[VAL42:%.*]] = load i8, ptr [[VAL41]], align 1
-; CHECK-NEXT: [[VAL43:%.*]] = uitofp i8 [[VAL42]] to float
-; CHECK-NEXT: [[VAL44:%.*]] = fsub fast float [[VAL34]], [[VAL]]
-; CHECK-NEXT: [[VAL45:%.*]] = fsub fast float [[VAL37]], [[VAL9]]
-; CHECK-NEXT: [[VAL46:%.*]] = fsub fast float [[VAL40]], [[VAL10]]
-; CHECK-NEXT: [[VAL47:%.*]] = fsub fast float [[VAL43]], [[VAL11]]
-; CHECK-NEXT: [[VAL48:%.*]] = fmul fast float [[VAL44]], [[VAL26]]
-; CHECK-NEXT: [[VAL49:%.*]] = fmul fast float [[VAL45]], [[VAL27]]
-; CHECK-NEXT: [[VAL50:%.*]] = fadd fast float [[VAL49]], [[VAL48]]
-; CHECK-NEXT: [[VAL51:%.*]] = fmul fast float [[VAL46]], [[VAL28]]
-; CHECK-NEXT: [[VAL52:%.*]] = fadd fast float [[VAL50]], [[VAL51]]
-; CHECK-NEXT: [[VAL53:%.*]] = fmul fast float [[VAL47]], [[VAL29]]
-; CHECK-NEXT: [[VAL54:%.*]] = fadd fast float [[VAL52]], [[VAL53]]
+; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[TMP11]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = uitofp <4 x i8> [[TMP12]] to <4 x float>
+; CHECK-NEXT: [[TMP14:%.*]] = fsub fast <4 x float> [[TMP13]], [[TMP3]]
+; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x float> [[TMP14]], [[TMP10]]
+; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]])
; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]])
-; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[VAL54]])
+; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP16]])
; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]])
; CHECK-NEXT: br i1 [[ARG8:%.*]], label [[BB30]], label [[BB57]]
; CHECK: bb57: