[llvm] [SLP] Loop aware cost model/tree building (PR #150450)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 7 11:32:24 PDT 2025
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/150450
>From e823808fef752b9e05e46c154169ca524b39d464 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Thu, 24 Jul 2025 16:10:23 +0000
Subject: [PATCH] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 193 ++++++++++++++++--
.../SLPVectorizer/AArch64/slp-fma-loss.ll | 23 +--
llvm/test/Transforms/SLPVectorizer/X86/phi.ll | 21 +-
.../SLPVectorizer/X86/remark_horcost.ll | 4 +-
.../SLPVectorizer/X86/remark_not_all_parts.ll | 2 +-
5 files changed, 205 insertions(+), 38 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0adad5a90d31c..7063cb80c5b5a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -212,6 +212,11 @@ static cl::opt<bool> VectorizeCopyableElements(
cl::desc("Try to replace values with the idempotent instructions for "
"better vectorization."));
+static cl::opt<unsigned> LoopAwareMinTripCount(
+ "slp-cost-loop-min-trip-count", cl::init(1), cl::Hidden,
+ cl::desc("Minimum loop trip count, considered by the cost model during "
+ "modeling (0=loops are ignored and considered flat code)"));
+
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
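For reference, the new knob can be exercised when running the pass standalone; a minimal sketch of an invocation (the flag name comes from this patch, the pass-pipeline spelling is the usual one for SLP):

  opt -passes=slp-vectorizer -slp-cost-loop-min-trip-count=4 -S input.ll

With the default cl::init(1), the scale degenerates to the exact SCEV trip count when it is known and 1 otherwise; setting the flag to 0 disables the loop-aware model entirely.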
@@ -2033,6 +2038,7 @@ class BoUpSLP {
UserIgnoreList = nullptr;
PostponedGathers.clear();
ValueToGatherNodes.clear();
+ LoopNest.clear();
}
unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -3575,6 +3581,15 @@ class BoUpSLP {
TargetTransformInfo::CastContextHint
getCastContextHint(const TreeEntry &TE) const;
+ /// \returns the cost scale factor for the given tree entry, derived from
+ /// the trip counts of its enclosing loop nest.
+ /// \p Scalar is the scalar value from the entry, if the parent is used for
+ /// the external use.
+ /// \p U is the user of the vectorized value from the entry, if the parent
+ /// is used for the external use.
+ unsigned getScaleToLoopIterations(const TreeEntry &TE,
+ Value *Scalar = nullptr,
+ Instruction *U = nullptr) const;
+
/// \returns the cost of the vectorizable entry.
InstructionCost getEntryCost(const TreeEntry *E,
ArrayRef<Value *> VectorizedVals,
@@ -4471,6 +4486,10 @@ class BoUpSLP {
std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
CompressEntryToData;
+ /// The loop nest, used to check that only a single loop nest is vectorized
+ /// at a time, not several, to avoid side effects from the loop-aware cost
+ /// model.
+ SmallVector<const Loop *> LoopNest;
+
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
@@ -9149,6 +9168,33 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
return {IntrinsicCost, LibCost};
}
+/// Find the innermost loop, starting from \p L, for which at least one value
+/// in \p VL is not invariant.
+static const Loop *findInnermostNonInvariantLoop(const Loop *L,
+ ArrayRef<Value *> VL) {
+ assert(L && "Expected valid loop");
+ auto IsLoopInvariant = [&](const Loop *L, ArrayRef<Value *> VL) {
+ return all_of(VL, [&](Value *V) { return L->isLoopInvariant(V); });
+ };
+ while (L && IsLoopInvariant(L, VL))
+ L = L->getParentLoop();
+ return L;
+}
+
+/// Get the loop nest for the given loop, ordered from outermost to innermost.
+static SmallVector<const Loop *> getLoopNest(const Loop *L) {
+ assert(L && "Expected valid loop");
+ SmallVector<const Loop *> LoopNest;
+ if (LoopAwareMinTripCount == 0)
+ return LoopNest;
+ while (L) {
+ LoopNest.push_back(L);
+ L = L->getParentLoop();
+ }
+ SmallVector<const Loop *> Res(LoopNest.rbegin(), LoopNest.rend());
+ return Res;
+}
+
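As a sanity check of the ordering, a minimal standalone sketch (plain C++, not LLVM code; the names are made up) of what getLoopNest computes: it walks parent links from the innermost loop and then reverses, so callers see the outermost loop first:

  #include <algorithm>
  #include <cstdio>
  #include <vector>

  struct Loop {
    const char *Name;
    Loop *Parent; // nullptr for a top-level loop
  };

  static std::vector<Loop *> getLoopNest(Loop *L) {
    std::vector<Loop *> Nest;
    for (; L; L = L->Parent)
      Nest.push_back(L); // innermost ... outermost
    std::reverse(Nest.begin(), Nest.end()); // outermost ... innermost
    return Nest;
  }

  int main() {
    Loop Outer{"outer", nullptr}, Inner{"inner", &Outer};
    for (Loop *L : getLoopNest(&Inner))
      std::printf("%s\n", L->Name); // prints: outer, then inner
  }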
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
const InstructionsState &S, ArrayRef<Value *> VL,
bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
@@ -9156,6 +9202,43 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
assert(S.getMainOp() &&
"Expected instructions with same/alternate opcodes only.");
+ // Check the loop nest. We need to be sure that only a single loop nest is
+ // handled at a time, to avoid incorrect cost estimation by the loop-aware
+ // cost model.
+ if (VectorizableTree.empty()) {
+ assert(LoopNest.empty() && "Expected empty loop nest");
+ // Processing the first node - do the initial fill of the loop nest.
+ BasicBlock *Parent = S.getMainOp()->getParent();
+ if (const Loop *L = LI->getLoopFor(Parent)) {
+ L = findInnermostNonInvariantLoop(L, VL);
+ if (L)
+ LoopNest = getLoopNest(L);
+ }
+ } else {
+ BasicBlock *Parent = S.getMainOp()->getParent();
+ if (const Loop *L = LI->getLoopFor(Parent)) {
+ // Check that a different loop nest is not involved; otherwise, mark the
+ // node as a gather node.
+ L = findInnermostNonInvariantLoop(L, VL);
+ if (L) {
+ SmallVector<const Loop *> NewLoopNest = getLoopNest(L);
+ for (const auto [L1, L2] : zip_longest(LoopNest, NewLoopNest)) {
+ if (L1 && L2) {
+ if (*L1 != *L2) {
+ LLVM_DEBUG(dbgs() << "SLP: Different loop nest.\n");
+ return TreeEntry::NeedToGather;
+ }
+ continue;
+ }
+ if (!L2)
+ break;
+ assert(!L1 && "L1 is expected to be null");
+ LoopNest.push_back(*L2);
+ }
+ }
+ }
+ }
+
unsigned ShuffleOrOp =
S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
Instruction *VL0 = S.getMainOp();
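The zip_longest walk above amounts to a common-prefix check: a bundle whose loop nest diverges anywhere from the recorded one is gathered, while a strictly deeper nest just extends the recorded one. A minimal standalone model of that policy (plain C++, not LLVM code; loops simplified to strings):

  #include <algorithm>
  #include <cstdio>
  #include <string>
  #include <vector>

  // Returns true if NewNest shares a common prefix with Nest (extending
  // Nest if NewNest is deeper); false models "different loop nest", i.e.
  // the bundle would become a gather node.
  static bool mergeNest(std::vector<std::string> &Nest,
                        const std::vector<std::string> &NewNest) {
    size_t Common = std::min(Nest.size(), NewNest.size());
    for (size_t I = 0; I != Common; ++I)
      if (Nest[I] != NewNest[I])
        return false; // mismatch -> NeedToGather
    for (size_t I = Nest.size(); I < NewNest.size(); ++I)
      Nest.push_back(NewNest[I]); // deepen the recorded nest
    return true;
  }

  int main() {
    std::vector<std::string> Nest = {"outer"};
    std::printf("%d\n", mergeNest(Nest, {"outer", "inner"})); // 1: extends
    std::printf("%d\n", mergeNest(Nest, {"other"}));          // 0: gather
  }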
@@ -13342,6 +13425,59 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
return TTI::CastContextHint::None;
}
+/// Get the minimum loop trip count for the loop \p L.
+static unsigned getLoopMinTripCount(const Loop *L, ScalarEvolution &SE) {
+ if (LoopAwareMinTripCount == 0)
+ return 1;
+ // Multiple exiting blocks - the exact trip count cannot be queried, fall
+ // back to the flag value.
+ if (!L->getExitingBlock())
+ return LoopAwareMinTripCount;
+ if (unsigned Scale = SE.getSmallConstantTripCount(L))
+ return Scale;
+ return LoopAwareMinTripCount;
+}
+
+unsigned BoUpSLP::getScaleToLoopIterations(const TreeEntry &TE, Value *Scalar,
+ Instruction *U) const {
+ unsigned Scale = 1;
+ if (TE.State == TreeEntry::SplitVectorize)
+ return Scale;
+ BasicBlock *Parent = nullptr;
+ if (U) {
+ Parent = U->getParent();
+ } else if (TE.isGather()) {
+ EdgeInfo EI = TE.UserTreeIndex;
+ while (EI.UserTE) {
+ if (EI.UserTE->isGather()) {
+ EI = EI.UserTE->UserTreeIndex;
+ continue;
+ }
+ if (EI.UserTE->State == TreeEntry::Vectorize &&
+ EI.UserTE->getOpcode() == Instruction::PHI) {
+ auto *PH = cast<PHINode>(EI.UserTE->getMainOp());
+ Parent = PH->getIncomingBlock(EI.EdgeIdx);
+ } else {
+ Parent = EI.UserTE->getMainOp()->getParent();
+ }
+ break;
+ }
+ if (!Parent)
+ return Scale;
+ } else {
+ Parent = TE.getMainOp()->getParent();
+ }
+ if (const Loop *L = LI->getLoopFor(Parent)) {
+ L = findInnermostNonInvariantLoop(L, Scalar ? ArrayRef(Scalar)
+ : ArrayRef(TE.Scalars));
+ if (L) {
+ SmallVector<const Loop *> Nest = getLoopNest(L);
+ for (const Loop *L : reverse(Nest))
+ Scale *= getLoopMinTripCount(L, *SE);
+ }
+ }
+ return Scale;
+}
+
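Putting the two helpers together, the scale for an entry is the product of the minimum trip counts along its loop nest, outermost to innermost. A worked sketch (assumed trip counts; plain C++, not LLVM code):

  #include <cstdio>
  #include <vector>

  int main() {
    // Hypothetical nest around the bundle: outer loop with a known SCEV
    // trip count of 4, inner loop with a known trip count of 2.
    std::vector<int> MinTripCounts = {4, 2};
    int Scale = 1;
    for (int TC : MinTripCounts)
      Scale *= TC; // Scale = 4 * 2 = 8
    // A per-iteration entry cost of -7 becomes -56 once scaled, so
    // profitable in-loop trees look proportionally more profitable.
    std::printf("scaled cost = %d\n", -7 * Scale);
  }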
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallPtrSetImpl<Value *> &CheckedExtracts) {
@@ -14689,10 +14825,14 @@ InstructionCost BoUpSLP::getSpillCost() {
if (It != MinBWs.end())
ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
- Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
+ unsigned Scale = getScaleToLoopIterations(*Op);
+ InstructionCost KeepLiveCost = TTI->getCostOfKeepingLiveOverCall(VecTy);
+ KeepLiveCost *= Scale;
+ Cost += KeepLiveCost;
if (ScalarTy->isVectorTy()) {
// Handle revec dead vector instructions.
- Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
+ Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy) *
+ Scale;
}
};
// Memoize the relationship between blocks, i.e. if there is (at least one)
@@ -14991,7 +15131,7 @@ template <typename T> struct ShuffledInsertData {
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
InstructionCost ReductionCost) {
- InstructionCost Cost = ReductionCost;
+ InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
@@ -15026,13 +15166,29 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
"Expected gather nodes with users only.");
- InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
+ InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts) *
+ getScaleToLoopIterations(TE);
Cost += C;
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
<< shortBundleName(TE.Scalars, TE.Idx) << ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
}
+ // Find the reduction root and scale the reduction cost, if any.
+ Instruction *ReductionRoot = nullptr;
+ if (UserIgnoreList) {
+ const auto It = find_if(*UserIgnoreList, IsaPred<Instruction>);
+ assert(It != UserIgnoreList->end() && "Expected reduction instruction.");
+ ReductionRoot = cast<Instruction>(*It);
+ // Scale the reduction cost by the factor of the loop nest trip count.
+ ReductionCost *=
+ getScaleToLoopIterations(*VectorizableTree.front().get(),
+ /*Scalar=*/nullptr, ReductionRoot);
+ }
+
+ // Add the cost for reduction.
+ Cost += ReductionCost;
+
if (Cost >= -SLPCostThreshold &&
none_of(ExternalUses, [](const ExternalUser &EU) {
return isa_and_nonnull<InsertElementInst>(EU.User);
@@ -15320,6 +15476,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
}
}
+ ExtraCost *= getScaleToLoopIterations(EU.E, EU.Scalar,
+ cast_or_null<Instruction>(EU.User));
+
ExtractCost += ExtraCost;
}
// Insert externals for extract of operands of casts to be emitted as scalars
@@ -15331,7 +15490,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
TEs.front()->findLaneForValue(V));
}
}
- // Add reduced value cost, if resized.
if (!VectorizedVals.empty()) {
const TreeEntry &Root = *VectorizableTree.front();
auto BWIt = MinBWs.find(&Root);
@@ -15349,9 +15507,12 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
assert(SLPReVec && "Only supported by REVEC.");
SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
}
- Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
- TTI::CastContextHint::None,
- TTI::TCK_RecipThroughput);
+ InstructionCost CastCost =
+ TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
+ TTI::CastContextHint::None,
+ TTI::TCK_RecipThroughput) *
+ getScaleToLoopIterations(Root, /*Scalar=*/nullptr, ReductionRoot);
+ Cost += CastCost;
}
}
}
@@ -15423,6 +15584,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
})) {
InstructionCost C =
::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
+ C *= getScaleToLoopIterations(*TEs.front());
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for final shuffle of insertelement "
"external users.\n";
@@ -15441,6 +15603,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
InstructionCost C =
::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
+ C *= getScaleToLoopIterations(*TEs.back());
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for final shuffle of vector node and external "
"insertelement users.\n";
@@ -15494,7 +15657,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
auto *DstVecTy =
getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
TTI::CastContextHint CCH = getCastContextHint(E);
- InstructionCost CastCost;
switch (E.getOpcode()) {
case Instruction::SExt:
case Instruction::ZExt:
@@ -15506,8 +15668,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
default:
break;
}
- CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
- TTI::TCK_RecipThroughput);
+ InstructionCost CastCost =
+ TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
+ TTI::TCK_RecipThroughput) *
+ getScaleToLoopIterations(*VectorizableTree.front().get(),
+ /*Scalar=*/nullptr, ReductionRoot);
Cost += CastCost;
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
<< " for final resize for reduction from " << SrcVecTy
@@ -15531,8 +15696,10 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
OS << *SpillCost;
else
OS << "<skipped>";
- OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
- << "SLP: Total Cost = " << Cost << ".\n";
+ OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n";
+ if (ReductionRoot)
+ OS << "SLP: Reduction Cost = " << ReductionCost << ".\n";
+ OS << "SLP: Total Cost = " << Cost << ".\n";
}
LLVM_DEBUG(dbgs() << Str);
if (ViewSLPTree)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
index 03f67ecb3e695..7000d5d0f26a6 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -216,23 +216,22 @@ define void @slp_profitable_missing_fmf_nnans_only(ptr %A, ptr %B) {
define float @slp_not_profitable_in_loop(float %x, ptr %A) {
; CHECK-LABEL: @slp_not_profitable_in_loop(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 2
-; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_A_2]], align 4
+; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[A1:%.*]], i64 1
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
-; CHECK-NEXT: [[L_3:%.*]] = load float, ptr [[A]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+00>, float [[X:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[L_2:%.*]] = load float, ptr [[A1]], align 4
+; CHECK-NEXT: [[L_3:%.*]] = load float, ptr [[A1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> <float 3.000000e+00, float 3.000000e+00, float poison, float 3.000000e+00>, float [[X:%.*]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[L_2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[L_3]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
-; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float 3.000000e+00, [[L_1]]
-; CHECK-NEXT: [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_3]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT: [[ADD13:%.*]] = fadd fast float [[ADD]], [[TMP4]]
-; CHECK-NEXT: [[RED_NEXT]] = fadd fast float [[ADD13]], [[MUL16]]
+; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[RED_NEXT]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP6]])
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 10
; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
index 17ae33652b6d8..245d284fb3169 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -136,30 +136,31 @@ for.end: ; preds = %for.body
define float @foo3(ptr nocapture readonly %A) #0 {
; CHECK-LABEL: @foo3(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
+; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = phi float [ [[TMP4]], [[ENTRY]] ], [ [[TMP21:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[TMP10:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP15:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00
; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]]
; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2
; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3
; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 4
+; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP21]] = load float, ptr [[ARRAYIDX24]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX14]], align 4
-; CHECK-NEXT: [[TMP7]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
+; CHECK-NEXT: [[TMP10]] = load float, ptr [[ARRAYIDX19]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 5, i32 1, i32 2, i32 poison>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP21]], i32 3
; CHECK-NEXT: [[TMP14:%.*]] = fmul <4 x float> [[TMP13]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>
; CHECK-NEXT: [[TMP15]] = fadd <4 x float> [[TMP3]], [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
index f1cd42a2c404a..7641dd5eb4210 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
@@ -86,7 +86,7 @@ for.body: ; preds = %for.body, %entry
; YAML-NEXT: Function: foo
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
- ; YAML-NEXT: - Cost: '-5'
+ ; YAML-NEXT: - Cost: '-40'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '4'
@@ -96,7 +96,7 @@ for.body: ; preds = %for.body, %entry
; YAML-NEXT: Function: foo
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
- ; YAML-NEXT: - Cost: '-7'
+ ; YAML-NEXT: - Cost: '-56'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '1'
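Note that the remark costs here and in remark_not_all_parts.ll below all scale uniformly by a factor of 8, consistent with the test loops having a constant trip count of 8 picked up by getLoopMinTripCount (an inference from the numbers, not verified against the test IR):

  Stores:    -5 * 8 = -40
  Reduction: -7 * 8 = -56
  (below)    -1 * 8 = -8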
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
index 6d96d6d29cd59..82fcde5319aa0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
@@ -62,7 +62,7 @@ for.body: ; preds = %for.body, %entry
; YAML-NEXT: Function: foo
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
- ; YAML-NEXT: - Cost: '-1'
+ ; YAML-NEXT: - Cost: '-8'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '4'