[llvm] [SLP] Check for extracts being replaced by original scalars for user nodes (PR #149572)

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 18 12:35:09 PDT 2025


https://github.com/alexey-bataev created https://github.com/llvm/llvm-project/pull/149572

It is better to check whether the extracts for external uses of the user
nodes are expensive and would be better represented as the original
scalars. In that case, it is better to keep their operands scalar too,
e.g. for a buildvector node. Otherwise, expensive extracts will be
created, pessimizing the cost of the tree.
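
In rough pseudocode, the new check boils down to a cost comparison along
these lines (a condensed sketch, not the exact control flow of the patch;
the names mirror the patch, but the helper calls are simplified for
illustration):

  // Sum the cost of keeping the externally used lanes of the user node
  // as the original scalar instructions.
  InstructionCost UserScalarsCost = 0;
  for (Value *V : UserTE->Scalars)
    if (auto *I = dyn_cast<Instruction>(V);
        I && !areAllUsersVectorized(I, UserIgnoreList)) {
      DemandedElts.setBit(UserTE->findLaneForValue(V));
      UserScalarsCost += TTI->getInstructionCost(I, CostKind);
    }

  // Cost of instead extracting those lanes from the vectorized user
  // node.
  InstructionCost ExtractCost = getScalarizationOverhead(
      *TTI, UserScalarTy, UserVecTy, DemandedElts,
      /*Insert=*/false, /*Extract=*/true, CostKind);

  // If the extracts are cheap enough, vectorizing the operands (e.g. a
  // buildvector node) stays profitable; otherwise the node is postponed
  // and reconsidered in transformNodes() once the users are vectorized.
  if (ExtractCost <= UserScalarsCost)
    return true; // Profitable to vectorize.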


From eb2d7aa90afdb5fc6f54a4fd9b4828c4050cc7ce Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 18 Jul 2025 19:34:54 +0000
Subject: [PATCH] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 146 +++++++++++++++++-
 .../RISCV/reordered-buildvector-scalars.ll    | 102 ++++++------
 2 files changed, 197 insertions(+), 51 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6ad5c60105a28..14d4e45d1ab6f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1906,6 +1906,7 @@ class BoUpSLP {
   void deleteTree() {
     VectorizableTree.clear();
     ScalarToTreeEntries.clear();
+    PostponedNodesWithNonVecUsers.clear();
     OperandsToTreeEntry.clear();
     ScalarsInSplitNodes.clear();
     MustGather.clear();
@@ -3896,6 +3897,9 @@ class BoUpSLP {
 
     bool hasState() const { return S.valid(); }
 
+    /// Returns the state of the operations.
+    const InstructionsState &getOperations() const { return S; }
+
     /// When ReuseReorderShuffleIndices is empty it just returns position of \p
     /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
     unsigned findLaneForValue(Value *V) const {
@@ -4290,6 +4294,13 @@ class BoUpSLP {
                                OrdersType &CurrentOrder,
                                SmallVectorImpl<Value *> &PointerOps);
 
+  /// Checks if it is profitable to vectorize the specified list of
+  /// instructions when not all of their users are vectorized.
+  bool isProfitableToVectorizeWithNonVecUsers(const InstructionsState &S,
+                                              const EdgeInfo &UserTreeIdx,
+                                              ArrayRef<Value *> Scalars,
+                                              ArrayRef<int> ScalarsMask);
+
   /// Maps a specific scalar to its tree entry(ies).
   SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
 
@@ -4300,6 +4311,9 @@ class BoUpSLP {
   /// Scalars, used in split vectorize nodes.
   SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
 
+  /// List of tree node indices which have non-vectorized users.
+  SmallSet<unsigned, 4> PostponedNodesWithNonVecUsers;
+
   /// Maps a value to the proposed vectorizable size.
   SmallDenseMap<Value *, unsigned> InstrElementSize;
 
@@ -8993,6 +9007,81 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
   return {IntrinsicCost, LibCost};
 }
 
+bool BoUpSLP::isProfitableToVectorizeWithNonVecUsers(
+    const InstructionsState &S, const EdgeInfo &UserTreeIdx,
+    ArrayRef<Value *> Scalars, ArrayRef<int> ScalarsMask) {
+  assert(S && "Expected valid instructions state.");
+  // Loads, extracts and geps are immediately scalarizable, so no need to check.
+  if (S.getOpcode() == Instruction::Load ||
+      S.getOpcode() == Instruction::ExtractElement ||
+      S.getOpcode() == Instruction::GetElementPtr)
+    return true;
+  // Check only vectorized users; others are (potentially, at least) already
+  // scalarized.
+  if (!UserTreeIdx.UserTE || UserTreeIdx.UserTE->isGather() ||
+      UserTreeIdx.UserTE->State == TreeEntry::SplitVectorize)
+    return true;
+  // PHI nodes may have cyclic dependencies, so they cannot be checked here.
+  if (UserTreeIdx.UserTE->getOpcode() == Instruction::PHI)
+    return true;
+  // Do not check root reduction nodes; they do not have non-vectorized users.
+  if (UserIgnoreList && UserTreeIdx.UserTE->Idx == 0)
+    return true;
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  ArrayRef<Value *> VL = UserTreeIdx.UserTE->Scalars;
+  Type *UserScalarTy = getValueType(VL.front());
+  if (!isValidElementType(UserScalarTy))
+    return true;
+  Type *ScalarTy = getValueType(Scalars.front());
+  if (!isValidElementType(ScalarTy))
+    return true;
+  // Ignore subvector extracts.
+  if (UserScalarTy->isVectorTy())
+    return true;
+  auto *UserVecTy =
+      getWidenedType(UserScalarTy, UserTreeIdx.UserTE->getVectorFactor());
+  APInt DemandedElts = APInt::getZero(UserTreeIdx.UserTE->getVectorFactor());
+  // Check the external uses to determine whether the vector node + extracts
+  // combination is profitable for vectorization.
+  InstructionCost UserScalarsCost = 0;
+  for (Value *V : VL) {
+    auto *I = dyn_cast<Instruction>(V);
+    if (!I)
+      continue;
+    if (areAllUsersVectorized(I, UserIgnoreList))
+      continue;
+    DemandedElts.setBit(UserTreeIdx.UserTE->findLaneForValue(V));
+    UserScalarsCost += TTI->getInstructionCost(I, CostKind);
+  }
+  // No non-vectorized users - success.
+  if (DemandedElts.isZero())
+    return true;
+  // If extracts are cheaper than the original scalars - success.
+  InstructionCost ExtractCost =
+      ::getScalarizationOverhead(*TTI, UserScalarTy, UserVecTy, DemandedElts,
+                                 /*Insert=*/false, /*Extract=*/true, CostKind);
+  if (ExtractCost <= UserScalarsCost)
+    return true;
+  SmallPtrSet<Value *, 4> CheckedExtracts;
+  InstructionCost NodeCost =
+      UserTreeIdx.UserTE->State == TreeEntry::CombinedVectorize
+          ? InstructionCost(0)
+          : getEntryCost(UserTreeIdx.UserTE, {}, CheckedExtracts);
+  // The node is profitable for vectorization - success.
+  if (ExtractCost + NodeCost <= -SLPCostThreshold)
+    return true;
+  auto *VecTy = getWidenedType(ScalarTy, Scalars.size());
+  InstructionCost ScalarsCost = ::getScalarizationOverhead(
+      *TTI, ScalarTy, VecTy, APInt::getAllOnes(Scalars.size()),
+      /*Insert=*/true, /*Extract=*/false, CostKind);
+  if (!ScalarsMask.empty())
+    ScalarsCost += getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, VecTy,
+                                  ScalarsMask, CostKind);
+
+  // User extracts are cheaper than user scalars + immediate scalars - success.
+  return ExtractCost - (UserScalarsCost + ScalarsCost) < -SLPCostThreshold;
+}
+
 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     const InstructionsState &S, ArrayRef<Value *> VL,
     bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
@@ -10283,6 +10372,17 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
     return;
   }
 
+  // Postpone vectorization if the node is not profitable because of the
+  // external uses.
+  if (!isProfitableToVectorizeWithNonVecUsers(S, UserTreeIdx, VL,
+                                              ReuseShuffleIndices)) {
+    PostponedNodesWithNonVecUsers.insert(VectorizableTree.size());
+    auto Invalid = ScheduleBundle::invalid();
+    newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
+                 ReuseShuffleIndices);
+    return;
+  }
+
   Instruction *VL0 = S.getMainOp();
   BasicBlock *BB = VL0->getParent();
   auto &BSRef = BlocksSchedules[BB];
@@ -11668,6 +11768,27 @@ void BoUpSLP::transformNodes() {
       ArrayRef<Value *> VL = E.Scalars;
       const unsigned Sz = getVectorElementSize(VL.front());
       unsigned MinVF = getMinVF(2 * Sz);
+      const EdgeInfo &EI = E.UserTreeIndex;
+      // Try to vectorize postponed scalars if the external uses are vectorized.
+      if (PostponedNodesWithNonVecUsers.contains(E.Idx) &&
+          isProfitableToVectorizeWithNonVecUsers(
+              E.getOperations(), EI, E.Scalars, E.ReuseShuffleIndices)) {
+        assert(E.hasState() && "Expected to have state");
+        unsigned PrevSize = VectorizableTree.size();
+        [[maybe_unused]] unsigned PrevEntriesSize =
+            LoadEntriesToVectorize.size();
+        buildTreeRec(VL, 0, EdgeInfo(&E, UINT_MAX));
+        if (PrevSize + 1 == VectorizableTree.size() &&
+            VectorizableTree[PrevSize]->isGather()) {
+          VectorizableTree.pop_back();
+          assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
+                 "LoadEntriesToVectorize expected to remain the same");
+        } else {
+          E.CombinedEntriesWithIndices.emplace_back(PrevSize, 0);
+          continue;
+        }
+      }
+
       // Do not try partial vectorization for small nodes (<= 2), nodes with the
       // same opcode and same parent block or all constants.
       if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
@@ -12828,7 +12949,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       });
       InVectors.front() = V;
     }
-    if (!SubVectors.empty()) {
+    if (!SubVectors.empty() &&
+        (SubVectors.size() > 1 || SubVectors.front().second != 0 ||
+         SubVectors.front().first->getVectorFactor() != CommonMask.size())) {
       const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
       if (InVectors.size() == 2)
         Cost += createShuffle(Vec, InVectors.back(), CommonMask);
@@ -13348,7 +13471,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   case Instruction::Trunc:
   case Instruction::FPTrunc:
   case Instruction::BitCast: {
-    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+    auto SrcIt =
+        MinBWs.empty() ? MinBWs.end() : MinBWs.find(getOperandEntry(E, 0));
     Type *SrcScalarTy = VL0->getOperand(0)->getType();
     auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
     unsigned Opcode = ShuffleOrOp;
@@ -13795,7 +13919,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
         auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
         if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
-          auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+          auto SrcIt = MinBWs.empty() ? MinBWs.end()
+                                      : MinBWs.find(getOperandEntry(E, 0));
           unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
           unsigned SrcBWSz =
               DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
@@ -14793,6 +14918,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
       auto *Inst = cast<Instruction>(EU.Scalar);
       InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
       auto OperandIsScalar = [&](Value *V) {
+        if (!isa<Instruction>(V))
+          return true;
         if (!isVectorized(V)) {
           // Some extractelements might be not vectorized, but
           // transformed into shuffle and removed from the function,
@@ -16873,7 +17000,18 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
       });
       InVectors.front() = Vec;
     }
-    if (!SubVectors.empty()) {
+    if (SubVectors.size() == 1 && SubVectors.front().second == 0 &&
+        SubVectors.front().first->getVectorFactor() == CommonMask.size()) {
+      Value *Vec = SubVectors.front().first->VectorizedValue;
+      if (Vec->getType()->isIntOrIntVectorTy())
+        Vec = castToScalarTyElem(
+            Vec, any_of(SubVectors.front().first->Scalars, [&](Value *V) {
+              if (isa<PoisonValue>(V))
+                return false;
+              return !isKnownNonNegative(V, SimplifyQuery(*R.DL));
+            }));
+      transformMaskAfterShuffle(CommonMask, CommonMask);
+    } else if (!SubVectors.empty()) {
       Value *Vec = InVectors.front();
       if (InVectors.size() == 2) {
         Vec = createShuffle(Vec, InVectors.back(), CommonMask);
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
index d4e323819402c..117c892b79f9a 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
@@ -102,80 +102,88 @@ define fastcc i32 @test(i32 %0, i32 %add111.i.i, <4 x i32> %PredPel.i.sroa.86.72
 ; THRESH-NEXT:    [[SHR143_5_I_I9:%.*]] = ashr i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[ADD1392_I:%.*]] = add i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[MUL1445_I:%.*]] = shl i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD2136_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], [[TMP0]]
-; THRESH-NEXT:    [[SHR2137_I:%.*]] = lshr i32 [[ADD2136_I]], 1
-; THRESH-NEXT:    [[CONV2138_I:%.*]] = trunc i32 [[SHR2137_I]] to i16
 ; THRESH-NEXT:    [[ADD2174_I:%.*]] = add i32 [[MUL1445_I]], 2
 ; THRESH-NEXT:    [[SHR2175_I:%.*]] = lshr i32 [[ADD2174_I]], 2
-; THRESH-NEXT:    [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16
 ; THRESH-NEXT:    [[ADD2190_I:%.*]] = or i32 [[ADD1392_I]], 1
 ; THRESH-NEXT:    [[ADD2191_I:%.*]] = add i32 [[ADD2190_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16
-; THRESH-NEXT:    [[ADD2203_I:%.*]] = or i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD2204_I:%.*]] = add i32 [[ADD2203_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2206_I:%.*]] = trunc i32 [[ADD2204_I]] to i16
+; THRESH-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; THRESH-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
+; THRESH-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
+; THRESH-NEXT:    [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
 ; THRESH-NEXT:    [[ADD2235_I16:%.*]] = or i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[ADD2236_I:%.*]] = add i32 [[ADD2235_I16]], 1
 ; THRESH-NEXT:    [[SHR2237_I:%.*]] = lshr i32 [[ADD2236_I]], 1
-; THRESH-NEXT:    [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4
-; THRESH-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8176), align 8
-; THRESH-NEXT:    [[ADD2258_I:%.*]] = or i32 [[ADD111_I_I]], [[TMP0]]
-; THRESH-NEXT:    [[SHR2259_I:%.*]] = lshr i32 [[ADD2258_I]], 1
-; THRESH-NEXT:    [[CONV2260_I:%.*]] = trunc i32 [[SHR2259_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8192), align 8
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8172), align 4
+; THRESH-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[ADD111_I_I]], i32 0
+; THRESH-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 1
+; THRESH-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; THRESH-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THRESH-NEXT:    [[TMP22:%.*]] = or <2 x i32> [[TMP19]], [[TMP21]]
 ; THRESH-NEXT:    [[ADD2302_I:%.*]] = add i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[SHR2303_I:%.*]] = lshr i32 [[ADD2302_I]], 1
-; THRESH-NEXT:    [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8184), align 8
 ; THRESH-NEXT:    [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[ADD2324_I:%.*]] = or i32 [[ADD2323_I]], [[TMP0]]
 ; THRESH-NEXT:    [[SHR2325_I:%.*]] = lshr i32 [[ADD2324_I]], 1
-; THRESH-NEXT:    [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4
-; THRESH-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8200), align 8
-; THRESH-NEXT:    [[ADD2342_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1
-; THRESH-NEXT:    [[SHR2343_I:%.*]] = lshr i32 [[ADD2342_I]], 1
-; THRESH-NEXT:    [[CONV2344_I:%.*]] = trunc i32 [[SHR2343_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2344_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8216), align 8
+; THRESH-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 1
+; THRESH-NEXT:    [[TMP12:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[SHR143_5_I_I9]], i32 0
+; THRESH-NEXT:    [[TMP13:%.*]] = add <2 x i32> [[TMP12]], splat (i32 1)
+; THRESH-NEXT:    [[TMP14:%.*]] = or <2 x i32> [[TMP12]], splat (i32 1)
+; THRESH-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> <i32 0, i32 3>
 ; THRESH-NEXT:    [[ADD2355_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1
 ; THRESH-NEXT:    [[ADD2356_I:%.*]] = add i32 [[ADD2355_I]], [[TMP0]]
 ; THRESH-NEXT:    [[CONV2358_I:%.*]] = trunc i32 [[ADD2356_I]] to i16
 ; THRESH-NEXT:    store i16 [[CONV2358_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8
-; THRESH-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; THRESH-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
-; THRESH-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
-; THRESH-NEXT:    [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
-; THRESH-NEXT:    [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
-; THRESH-NEXT:    store <2 x i16> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4
 ; THRESH-NEXT:    [[ADD2393_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1
 ; THRESH-NEXT:    [[ADD2394_I:%.*]] = add i32 [[ADD2393_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2
-; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8214), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8194), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8174), align 2
+; THRESH-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[SHR2303_I]], i32 2
+; THRESH-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> [[TMP17]], i32 [[SHR2175_I]], i32 3
+; THRESH-NEXT:    [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16
+; THRESH-NEXT:    [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16
+; THRESH-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4
 ; THRESH-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
 ; THRESH-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[ADD111_I_I]], i32 0
 ; THRESH-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP8]], splat (i32 1)
 ; THRESH-NEXT:    [[TMP10:%.*]] = lshr <2 x i32> [[TMP9]], splat (i32 1)
+; THRESH-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP24:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP23]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
 ; THRESH-NEXT:    [[TMP11:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16>
-; THRESH-NEXT:    [[TMP12:%.*]] = extractelement <2 x i16> [[TMP11]], i32 1
-; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
+; THRESH-NEXT:    [[TMP26:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP27:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16>
+; THRESH-NEXT:    store <2 x i16> [[TMP27]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
 ; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8
-; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4
-; THRESH-NEXT:    store i16 [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
 ; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2
 ; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2
-; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8186), align 2
+; THRESH-NEXT:    [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16
+; THRESH-NEXT:    [[TMP28:%.*]] = insertelement <4 x i32> poison, i32 [[ADD2394_I]], i32 0
+; THRESH-NEXT:    [[TMP29:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[SHR2325_I]], i32 1
+; THRESH-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[ADD2191_I]], i32 2
+; THRESH-NEXT:    [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[SHR2303_I]], i32 3
+; THRESH-NEXT:    [[TMP32:%.*]] = trunc <4 x i32> [[TMP31]] to <4 x i16>
+; THRESH-NEXT:    [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16
+; THRESH-NEXT:    [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16
+; THRESH-NEXT:    [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16
+; THRESH-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4
+; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2
+; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8
 ; THRESH-NEXT:    store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8222), align 2
-; THRESH-NEXT:    store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8202), align 2
-; THRESH-NEXT:    store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2
+; THRESH-NEXT:    store <4 x i16> [[TMP32]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2
+; THRESH-NEXT:    [[TMP33:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP34:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> [[TMP34]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; THRESH-NEXT:    [[TMP36:%.*]] = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 poison>, i32 [[TMP0]], i32 3
+; THRESH-NEXT:    [[TMP37:%.*]] = lshr <4 x i32> [[TMP35]], [[TMP36]]
+; THRESH-NEXT:    [[TMP38:%.*]] = add <4 x i32> [[TMP35]], [[TMP36]]
+; THRESH-NEXT:    [[TMP39:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> [[TMP38]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; THRESH-NEXT:    [[TMP40:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP41:%.*]] = shufflevector <8 x i32> [[TMP24]], <8 x i32> [[TMP40]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; THRESH-NEXT:    [[TMP42:%.*]] = trunc <8 x i32> [[TMP41]] to <8 x i16>
+; THRESH-NEXT:    [[TMP43:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP39]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; THRESH-NEXT:    [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[SHR2237_I]], i32 3
+; THRESH-NEXT:    [[TMP45:%.*]] = trunc <4 x i32> [[TMP44]] to <4 x i16>
+; THRESH-NEXT:    [[TMP46:%.*]] = trunc <4 x i32> [[TMP39]] to <4 x i16>
+; THRESH-NEXT:    store <4 x i16> [[TMP45]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
+; THRESH-NEXT:    store <8 x i16> [[TMP42]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4
+; THRESH-NEXT:    store <4 x i16> [[TMP46]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4
 ; THRESH-NEXT:    ret i32 0
 ;
 entry:


