[llvm] [SLP] Check for extracts, being replaced by original scalars, for user nodes (PR #149572)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 25 12:51:39 PDT 2025
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/149572
>From eb2d7aa90afdb5fc6f54a4fd9b4828c4050cc7ce Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 18 Jul 2025 19:34:54 +0000
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
=?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 146 +++++++++++++++++-
.../RISCV/reordered-buildvector-scalars.ll | 102 ++++++------
2 files changed, 197 insertions(+), 51 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6ad5c60105a28..14d4e45d1ab6f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1906,6 +1906,7 @@ class BoUpSLP {
void deleteTree() {
VectorizableTree.clear();
ScalarToTreeEntries.clear();
+ PostponedNodesWithNonVecUsers.clear();
OperandsToTreeEntry.clear();
ScalarsInSplitNodes.clear();
MustGather.clear();
@@ -3896,6 +3897,9 @@ class BoUpSLP {
bool hasState() const { return S.valid(); }
+ /// Returns the state of the operations.
+ const InstructionsState &getOperations() const { return S; }
+
/// When ReuseReorderShuffleIndices is empty it just returns position of \p
/// V within vector of Scalars. Otherwise, try to remap on its reuse index.
unsigned findLaneForValue(Value *V) const {
@@ -4290,6 +4294,13 @@ class BoUpSLP {
OrdersType &CurrentOrder,
SmallVectorImpl<Value *> &PointerOps);
+ /// Checks if it is profitable to vectorize the specified list of the
+ /// instructions if not all users are vectorized.
+ bool isProfitableToVectorizeWithNonVecUsers(const InstructionsState &S,
+ const EdgeInfo &UserTreeIdx,
+ ArrayRef<Value *> Scalars,
+ ArrayRef<int> ScalarsMask);
+
/// Maps a specific scalar to its tree entry(ies).
SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
@@ -4300,6 +4311,9 @@ class BoUpSLP {
/// Scalars, used in split vectorize nodes.
SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
+ /// List of tree nodes indices, which have non-vectorized users.
+ SmallSet<unsigned, 4> PostponedNodesWithNonVecUsers;
+
/// Maps a value to the proposed vectorizable size.
SmallDenseMap<Value *, unsigned> InstrElementSize;
@@ -8993,6 +9007,81 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
return {IntrinsicCost, LibCost};
}
+bool BoUpSLP::isProfitableToVectorizeWithNonVecUsers(
+ const InstructionsState &S, const EdgeInfo &UserTreeIdx,
+ ArrayRef<Value *> Scalars, ArrayRef<int> ScalarsMask) {
+ assert(S && "Expected valid instructions state.");
+ // Loads, extracts and geps are immediately scalarizable, so no need to check.
+ if (S.getOpcode() == Instruction::Load ||
+ S.getOpcode() == Instruction::ExtractElement ||
+ S.getOpcode() == Instruction::GetElementPtr)
+ return true;
+ // Check only vectorized users; the others are already (at least potentially)
+ // scalarized.
+ if (!UserTreeIdx.UserTE || UserTreeIdx.UserTE->isGather() ||
+ UserTreeIdx.UserTE->State == TreeEntry::SplitVectorize)
+ return true;
+ // PHI nodes may have cyclic deps, so cannot check here.
+ if (UserTreeIdx.UserTE->getOpcode() == Instruction::PHI)
+ return true;
+ // Do not check root reduction nodes, they do not have non-vectorized users.
+ if (UserIgnoreList && UserTreeIdx.UserTE->Idx == 0)
+ return true;
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ ArrayRef<Value *> VL = UserTreeIdx.UserTE->Scalars;
+ Type *UserScalarTy = getValueType(VL.front());
+ if (!isValidElementType(UserScalarTy))
+ return true;
+ Type *ScalarTy = getValueType(Scalars.front());
+ if (!isValidElementType(ScalarTy))
+ return true;
+ // Ignore subvectors extracts.
+ if (UserScalarTy->isVectorTy())
+ return true;
+ auto *UserVecTy =
+ getWidenedType(UserScalarTy, UserTreeIdx.UserTE->getVectorFactor());
+ APInt DemandedElts = APInt::getZero(UserTreeIdx.UserTE->getVectorFactor());
+ // Check the external uses and determine whether the vector node + extracts
+ // combination is profitable for vectorization.
+ InstructionCost UserScalarsCost = 0;
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (areAllUsersVectorized(I, UserIgnoreList))
+ continue;
+ DemandedElts.setBit(UserTreeIdx.UserTE->findLaneForValue(V));
+ UserScalarsCost += TTI->getInstructionCost(I, CostKind);
+ }
+ // No non-vectorized users - success.
+ if (DemandedElts.isZero())
+ return true;
+ // If extracts are cheaper than the original scalars - success.
+ InstructionCost ExtractCost =
+ ::getScalarizationOverhead(*TTI, UserScalarTy, UserVecTy, DemandedElts,
+ /*Insert=*/false, /*Extract=*/true, CostKind);
+ if (ExtractCost <= UserScalarsCost)
+ return true;
+ SmallPtrSet<Value *, 4> CheckedExtracts;
+ InstructionCost NodeCost =
+ UserTreeIdx.UserTE->State == TreeEntry::CombinedVectorize
+ ? InstructionCost(0)
+ : getEntryCost(UserTreeIdx.UserTE, {}, CheckedExtracts);
+ // The node is profitable for vectorization - success.
+ if (ExtractCost + NodeCost <= -SLPCostThreshold)
+ return true;
+ auto *VecTy = getWidenedType(ScalarTy, Scalars.size());
+ InstructionCost ScalarsCost = ::getScalarizationOverhead(
+ *TTI, ScalarTy, VecTy, APInt::getAllOnes(Scalars.size()),
+ /*Insert=*/true, /*Extract=*/false, CostKind);
+ if (!ScalarsMask.empty())
+ ScalarsCost += getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, VecTy,
+ ScalarsMask, CostKind);
+
+ // User extracts are cheaper than user scalars + immediate scalars - success.
+ return ExtractCost - (UserScalarsCost + ScalarsCost) < -SLPCostThreshold;
+}
+
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
const InstructionsState &S, ArrayRef<Value *> VL,
bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
@@ -10283,6 +10372,17 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
return;
}
+ // Postpone vectorization if the node is not profitable because of the
+ // external uses.
+ if (!isProfitableToVectorizeWithNonVecUsers(S, UserTreeIdx, VL,
+ ReuseShuffleIndices)) {
+ PostponedNodesWithNonVecUsers.insert(VectorizableTree.size());
+ auto Invalid = ScheduleBundle::invalid();
+ newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices);
+ return;
+ }
+
Instruction *VL0 = S.getMainOp();
BasicBlock *BB = VL0->getParent();
auto &BSRef = BlocksSchedules[BB];
@@ -11668,6 +11768,27 @@ void BoUpSLP::transformNodes() {
ArrayRef<Value *> VL = E.Scalars;
const unsigned Sz = getVectorElementSize(VL.front());
unsigned MinVF = getMinVF(2 * Sz);
+ const EdgeInfo &EI = E.UserTreeIndex;
+ // Try to vectorize postponed scalars, if their external uses are vectorized.
+ if (PostponedNodesWithNonVecUsers.contains(E.Idx) &&
+ isProfitableToVectorizeWithNonVecUsers(
+ E.getOperations(), EI, E.Scalars, E.ReuseShuffleIndices)) {
+ assert(E.hasState() && "Expected to have state");
+ unsigned PrevSize = VectorizableTree.size();
+ [[maybe_unused]] unsigned PrevEntriesSize =
+ LoadEntriesToVectorize.size();
+ buildTreeRec(VL, 0, EdgeInfo(&E, UINT_MAX));
+ if (PrevSize + 1 == VectorizableTree.size() &&
+ VectorizableTree[PrevSize]->isGather()) {
+ VectorizableTree.pop_back();
+ assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
+ "LoadEntriesToVectorize expected to remain the same");
+ } else {
+ E.CombinedEntriesWithIndices.emplace_back(PrevSize, 0);
+ continue;
+ }
+ }
+
// Do not try partial vectorization for small nodes (<= 2), nodes with the
// same opcode and same parent block or all constants.
if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
@@ -12828,7 +12949,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
});
InVectors.front() = V;
}
- if (!SubVectors.empty()) {
+ if (!SubVectors.empty() &&
+ (SubVectors.size() > 1 || SubVectors.front().second != 0 ||
+ SubVectors.front().first->getVectorFactor() != CommonMask.size())) {
const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
if (InVectors.size() == 2)
Cost += createShuffle(Vec, InVectors.back(), CommonMask);
@@ -13348,7 +13471,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
- auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+ auto SrcIt =
+ MinBWs.empty() ? MinBWs.end() : MinBWs.find(getOperandEntry(E, 0));
Type *SrcScalarTy = VL0->getOperand(0)->getType();
auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
unsigned Opcode = ShuffleOrOp;
@@ -13795,7 +13919,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
- auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+ auto SrcIt = MinBWs.empty() ? MinBWs.end()
+ : MinBWs.find(getOperandEntry(E, 0));
unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
unsigned SrcBWSz =
DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
@@ -14793,6 +14918,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
auto *Inst = cast<Instruction>(EU.Scalar);
InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
auto OperandIsScalar = [&](Value *V) {
+ if (!isa<Instruction>(V))
+ return true;
if (!isVectorized(V)) {
// Some extractelements might be not vectorized, but
// transformed into shuffle and removed from the function,
@@ -16873,7 +17000,18 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
});
InVectors.front() = Vec;
}
- if (!SubVectors.empty()) {
+ if (SubVectors.size() == 1 && SubVectors.front().second == 0 &&
+ SubVectors.front().first->getVectorFactor() == CommonMask.size()) {
+ Value *Vec = SubVectors.front().first->VectorizedValue;
+ if (Vec->getType()->isIntOrIntVectorTy())
+ Vec = castToScalarTyElem(
+ Vec, any_of(SubVectors.front().first->Scalars, [&](Value *V) {
+ if (isa<PoisonValue>(V))
+ return false;
+ return !isKnownNonNegative(V, SimplifyQuery(*R.DL));
+ }));
+ transformMaskAfterShuffle(CommonMask, CommonMask);
+ } else if (!SubVectors.empty()) {
Value *Vec = InVectors.front();
if (InVectors.size() == 2) {
Vec = createShuffle(Vec, InVectors.back(), CommonMask);
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
index d4e323819402c..117c892b79f9a 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
@@ -102,80 +102,88 @@ define fastcc i32 @test(i32 %0, i32 %add111.i.i, <4 x i32> %PredPel.i.sroa.86.72
; THRESH-NEXT: [[SHR143_5_I_I9:%.*]] = ashr i32 [[TMP0]], 1
; THRESH-NEXT: [[ADD1392_I:%.*]] = add i32 [[TMP0]], 1
; THRESH-NEXT: [[MUL1445_I:%.*]] = shl i32 [[TMP0]], 1
-; THRESH-NEXT: [[ADD2136_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], [[TMP0]]
-; THRESH-NEXT: [[SHR2137_I:%.*]] = lshr i32 [[ADD2136_I]], 1
-; THRESH-NEXT: [[CONV2138_I:%.*]] = trunc i32 [[SHR2137_I]] to i16
; THRESH-NEXT: [[ADD2174_I:%.*]] = add i32 [[MUL1445_I]], 2
; THRESH-NEXT: [[SHR2175_I:%.*]] = lshr i32 [[ADD2174_I]], 2
-; THRESH-NEXT: [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16
; THRESH-NEXT: [[ADD2190_I:%.*]] = or i32 [[ADD1392_I]], 1
; THRESH-NEXT: [[ADD2191_I:%.*]] = add i32 [[ADD2190_I]], [[TMP0]]
-; THRESH-NEXT: [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16
-; THRESH-NEXT: [[ADD2203_I:%.*]] = or i32 [[TMP0]], 1
-; THRESH-NEXT: [[ADD2204_I:%.*]] = add i32 [[ADD2203_I]], [[TMP0]]
-; THRESH-NEXT: [[CONV2206_I:%.*]] = trunc i32 [[ADD2204_I]] to i16
+; THRESH-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; THRESH-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
+; THRESH-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
+; THRESH-NEXT: [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
; THRESH-NEXT: [[ADD2235_I16:%.*]] = or i32 [[TMP0]], 1
; THRESH-NEXT: [[ADD2236_I:%.*]] = add i32 [[ADD2235_I16]], 1
; THRESH-NEXT: [[SHR2237_I:%.*]] = lshr i32 [[ADD2236_I]], 1
-; THRESH-NEXT: [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16
-; THRESH-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4
-; THRESH-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8176), align 8
-; THRESH-NEXT: [[ADD2258_I:%.*]] = or i32 [[ADD111_I_I]], [[TMP0]]
-; THRESH-NEXT: [[SHR2259_I:%.*]] = lshr i32 [[ADD2258_I]], 1
-; THRESH-NEXT: [[CONV2260_I:%.*]] = trunc i32 [[SHR2259_I]] to i16
-; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4
-; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8192), align 8
-; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8172), align 4
+; THRESH-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[ADD111_I_I]], i32 0
+; THRESH-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 1
+; THRESH-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; THRESH-NEXT: [[TMP21:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THRESH-NEXT: [[TMP22:%.*]] = or <2 x i32> [[TMP19]], [[TMP21]]
; THRESH-NEXT: [[ADD2302_I:%.*]] = add i32 [[TMP0]], 1
; THRESH-NEXT: [[SHR2303_I:%.*]] = lshr i32 [[ADD2302_I]], 1
-; THRESH-NEXT: [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16
-; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8
-; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4
-; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8184), align 8
; THRESH-NEXT: [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1
; THRESH-NEXT: [[ADD2324_I:%.*]] = or i32 [[ADD2323_I]], [[TMP0]]
; THRESH-NEXT: [[SHR2325_I:%.*]] = lshr i32 [[ADD2324_I]], 1
-; THRESH-NEXT: [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16
-; THRESH-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4
-; THRESH-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8200), align 8
-; THRESH-NEXT: [[ADD2342_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1
-; THRESH-NEXT: [[SHR2343_I:%.*]] = lshr i32 [[ADD2342_I]], 1
-; THRESH-NEXT: [[CONV2344_I:%.*]] = trunc i32 [[SHR2343_I]] to i16
-; THRESH-NEXT: store i16 [[CONV2344_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8216), align 8
+; THRESH-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 1
+; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[SHR143_5_I_I9]], i32 0
+; THRESH-NEXT: [[TMP13:%.*]] = add <2 x i32> [[TMP12]], splat (i32 1)
+; THRESH-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP12]], splat (i32 1)
+; THRESH-NEXT: [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> <i32 0, i32 3>
; THRESH-NEXT: [[ADD2355_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1
; THRESH-NEXT: [[ADD2356_I:%.*]] = add i32 [[ADD2355_I]], [[TMP0]]
; THRESH-NEXT: [[CONV2358_I:%.*]] = trunc i32 [[ADD2356_I]] to i16
; THRESH-NEXT: store i16 [[CONV2358_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8
-; THRESH-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; THRESH-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
-; THRESH-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
-; THRESH-NEXT: [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
-; THRESH-NEXT: [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
-; THRESH-NEXT: store <2 x i16> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4
; THRESH-NEXT: [[ADD2393_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1
; THRESH-NEXT: [[ADD2394_I:%.*]] = add i32 [[ADD2393_I]], [[TMP0]]
-; THRESH-NEXT: [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16
-; THRESH-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2
-; THRESH-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2
-; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8214), align 2
-; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8194), align 2
-; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8174), align 2
+; THRESH-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[SHR2303_I]], i32 2
+; THRESH-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> [[TMP17]], i32 [[SHR2175_I]], i32 3
+; THRESH-NEXT: [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16
+; THRESH-NEXT: [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16
+; THRESH-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4
; THRESH-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
; THRESH-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[ADD111_I_I]], i32 0
; THRESH-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], splat (i32 1)
; THRESH-NEXT: [[TMP10:%.*]] = lshr <2 x i32> [[TMP9]], splat (i32 1)
+; THRESH-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT: [[TMP24:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP23]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
; THRESH-NEXT: [[TMP11:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16>
-; THRESH-NEXT: [[TMP12:%.*]] = extractelement <2 x i16> [[TMP11]], i32 1
-; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
+; THRESH-NEXT: [[TMP26:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT: [[TMP27:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16>
+; THRESH-NEXT: store <2 x i16> [[TMP27]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8
-; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4
-; THRESH-NEXT: store i16 [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2
; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2
-; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8186), align 2
+; THRESH-NEXT: [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16
+; THRESH-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> poison, i32 [[ADD2394_I]], i32 0
+; THRESH-NEXT: [[TMP29:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[SHR2325_I]], i32 1
+; THRESH-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[ADD2191_I]], i32 2
+; THRESH-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[SHR2303_I]], i32 3
+; THRESH-NEXT: [[TMP32:%.*]] = trunc <4 x i32> [[TMP31]] to <4 x i16>
+; THRESH-NEXT: [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16
+; THRESH-NEXT: [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16
+; THRESH-NEXT: [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16
+; THRESH-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4
+; THRESH-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2
+; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8
; THRESH-NEXT: store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8222), align 2
-; THRESH-NEXT: store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8202), align 2
-; THRESH-NEXT: store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2
+; THRESH-NEXT: store <4 x i16> [[TMP32]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2
+; THRESH-NEXT: [[TMP33:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; THRESH-NEXT: [[TMP34:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; THRESH-NEXT: [[TMP35:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> [[TMP34]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; THRESH-NEXT: [[TMP36:%.*]] = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 poison>, i32 [[TMP0]], i32 3
+; THRESH-NEXT: [[TMP37:%.*]] = lshr <4 x i32> [[TMP35]], [[TMP36]]
+; THRESH-NEXT: [[TMP38:%.*]] = add <4 x i32> [[TMP35]], [[TMP36]]
+; THRESH-NEXT: [[TMP39:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> [[TMP38]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; THRESH-NEXT: [[TMP40:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT: [[TMP41:%.*]] = shufflevector <8 x i32> [[TMP24]], <8 x i32> [[TMP40]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; THRESH-NEXT: [[TMP42:%.*]] = trunc <8 x i32> [[TMP41]] to <8 x i16>
+; THRESH-NEXT: [[TMP43:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP39]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; THRESH-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[SHR2237_I]], i32 3
+; THRESH-NEXT: [[TMP45:%.*]] = trunc <4 x i32> [[TMP44]] to <4 x i16>
+; THRESH-NEXT: [[TMP46:%.*]] = trunc <4 x i32> [[TMP39]] to <4 x i16>
+; THRESH-NEXT: store <4 x i16> [[TMP45]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
+; THRESH-NEXT: store <8 x i16> [[TMP42]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4
+; THRESH-NEXT: store <4 x i16> [[TMP46]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4
; THRESH-NEXT: ret i32 0
;
entry:
More information about the llvm-commits
mailing list