[llvm] [SLP] Check for extracts, being replaced by original scalars, for user nodes (PR #149572)

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 25 12:51:39 PDT 2025


https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/149572

From eb2d7aa90afdb5fc6f54a4fd9b4828c4050cc7ce Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 18 Jul 2025 19:34:54 +0000
Subject: [PATCH] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 146 +++++++++++++++++-
 .../RISCV/reordered-buildvector-scalars.ll    | 102 ++++++------
 2 files changed, 197 insertions(+), 51 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6ad5c60105a28..14d4e45d1ab6f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1906,6 +1906,7 @@ class BoUpSLP {
   void deleteTree() {
     VectorizableTree.clear();
     ScalarToTreeEntries.clear();
+    PostponedNodesWithNonVecUsers.clear();
     OperandsToTreeEntry.clear();
     ScalarsInSplitNodes.clear();
     MustGather.clear();
@@ -3896,6 +3897,9 @@ class BoUpSLP {
 
     bool hasState() const { return S.valid(); }
 
+    /// Returns the state of the operations.
+    const InstructionsState &getOperations() const { return S; }
+
     /// When ReuseReorderShuffleIndices is empty it just returns position of \p
     /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
     unsigned findLaneForValue(Value *V) const {
@@ -4290,6 +4294,13 @@ class BoUpSLP {
                                OrdersType &CurrentOrder,
                                SmallVectorImpl<Value *> &PointerOps);
 
+  /// Checks if it is profitable to vectorize the specified list of
+  /// instructions when not all of their users are vectorized.
+  bool isProfitableToVectorizeWithNonVecUsers(const InstructionsState &S,
+                                              const EdgeInfo &UserTreeIdx,
+                                              ArrayRef<Value *> Scalars,
+                                              ArrayRef<int> ScalarsMask);
+
   /// Maps a specific scalar to its tree entry(ies).
   SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
 
@@ -4300,6 +4311,9 @@ class BoUpSLP {
   /// Scalars, used in split vectorize nodes.
   SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
 
+  /// List of tree node indices that have non-vectorized users.
+  SmallSet<unsigned, 4> PostponedNodesWithNonVecUsers;
+
   /// Maps a value to the proposed vectorizable size.
   SmallDenseMap<Value *, unsigned> InstrElementSize;
 
@@ -8993,6 +9007,81 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
   return {IntrinsicCost, LibCost};
 }
 
+bool BoUpSLP::isProfitableToVectorizeWithNonVecUsers(
+    const InstructionsState &S, const EdgeInfo &UserTreeIdx,
+    ArrayRef<Value *> Scalars, ArrayRef<int> ScalarsMask) {
+  assert(S && "Expected valid instructions state.");
+  // Loads, extracts and geps are immediately scalarizable, so no need to check.
+  if (S.getOpcode() == Instruction::Load ||
+      S.getOpcode() == Instruction::ExtractElement ||
+      S.getOpcode() == Instruction::GetElementPtr)
+    return true;
+  // Check only vectorized users; the others are (potentially, at least)
+  // already scalarized.
+  if (!UserTreeIdx.UserTE || UserTreeIdx.UserTE->isGather() ||
+      UserTreeIdx.UserTE->State == TreeEntry::SplitVectorize)
+    return true;
+  // PHI nodes may have cyclic dependencies, so they cannot be checked here.
+  if (UserTreeIdx.UserTE->getOpcode() == Instruction::PHI)
+    return true;
+  // Do not check root reduction nodes; they do not have non-vectorized users.
+  if (UserIgnoreList && UserTreeIdx.UserTE->Idx == 0)
+    return true;
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  ArrayRef<Value *> VL = UserTreeIdx.UserTE->Scalars;
+  Type *UserScalarTy = getValueType(VL.front());
+  if (!isValidElementType(UserScalarTy))
+    return true;
+  Type *ScalarTy = getValueType(Scalars.front());
+  if (!isValidElementType(ScalarTy))
+    return true;
+  // Ignore subvector extracts.
+  if (UserScalarTy->isVectorTy())
+    return true;
+  auto *UserVecTy =
+      getWidenedType(UserScalarTy, UserTreeIdx.UserTE->getVectorFactor());
+  APInt DemandedElts = APInt::getZero(UserTreeIdx.UserTE->getVectorFactor());
+  // Check the external uses to determine whether the vector node plus
+  // extracts is profitable for vectorization.
+  InstructionCost UserScalarsCost = 0;
+  for (Value *V : VL) {
+    auto *I = dyn_cast<Instruction>(V);
+    if (!I)
+      continue;
+    if (areAllUsersVectorized(I, UserIgnoreList))
+      continue;
+    DemandedElts.setBit(UserTreeIdx.UserTE->findLaneForValue(V));
+    UserScalarsCost += TTI->getInstructionCost(I, CostKind);
+  }
+  // No non-vectorized users - success.
+  if (DemandedElts.isZero())
+    return true;
+  // If extracts are cheaper than the original scalars - success.
+  InstructionCost ExtractCost =
+      ::getScalarizationOverhead(*TTI, UserScalarTy, UserVecTy, DemandedElts,
+                                 /*Insert=*/false, /*Extract=*/true, CostKind);
+  if (ExtractCost <= UserScalarsCost)
+    return true;
+  SmallPtrSet<Value *, 4> CheckedExtracts;
+  InstructionCost NodeCost =
+      UserTreeIdx.UserTE->State == TreeEntry::CombinedVectorize
+          ? InstructionCost(0)
+          : getEntryCost(UserTreeIdx.UserTE, {}, CheckedExtracts);
+  // The node is profitable for vectorization - success.
+  if (ExtractCost + NodeCost <= -SLPCostThreshold)
+    return true;
+  auto *VecTy = getWidenedType(ScalarTy, Scalars.size());
+  InstructionCost ScalarsCost = ::getScalarizationOverhead(
+      *TTI, ScalarTy, VecTy, APInt::getAllOnes(Scalars.size()),
+      /*Insert=*/true, /*Extract=*/false, CostKind);
+  if (!ScalarsMask.empty())
+    ScalarsCost += getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, VecTy,
+                                  ScalarsMask, CostKind);
+
+  // User extracts are cheaper than user scalars + immediate scalars - success.
+  return ExtractCost - (UserScalarsCost + ScalarsCost) < -SLPCostThreshold;
+}
+
 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     const InstructionsState &S, ArrayRef<Value *> VL,
     bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
@@ -10283,6 +10372,17 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
     return;
   }
 
+  // Postpone vectorization if the node is not profitable because of its
+  // external uses.
+  if (!isProfitableToVectorizeWithNonVecUsers(S, UserTreeIdx, VL,
+                                              ReuseShuffleIndices)) {
+    PostponedNodesWithNonVecUsers.insert(VectorizableTree.size());
+    auto Invalid = ScheduleBundle::invalid();
+    newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
+                 ReuseShuffleIndices);
+    return;
+  }
+
   Instruction *VL0 = S.getMainOp();
   BasicBlock *BB = VL0->getParent();
   auto &BSRef = BlocksSchedules[BB];
@@ -11668,6 +11768,27 @@ void BoUpSLP::transformNodes() {
       ArrayRef<Value *> VL = E.Scalars;
       const unsigned Sz = getVectorElementSize(VL.front());
       unsigned MinVF = getMinVF(2 * Sz);
+      const EdgeInfo &EI = E.UserTreeIndex;
+      // Try to vectorize postponed scalars if the external uses are vectorized.
+      if (PostponedNodesWithNonVecUsers.contains(E.Idx) &&
+          isProfitableToVectorizeWithNonVecUsers(
+              E.getOperations(), EI, E.Scalars, E.ReuseShuffleIndices)) {
+        assert(E.hasState() && "Expected to have state");
+        unsigned PrevSize = VectorizableTree.size();
+        [[maybe_unused]] unsigned PrevEntriesSize =
+            LoadEntriesToVectorize.size();
+        buildTreeRec(VL, 0, EdgeInfo(&E, UINT_MAX));
+        if (PrevSize + 1 == VectorizableTree.size() &&
+            VectorizableTree[PrevSize]->isGather()) {
+          VectorizableTree.pop_back();
+          assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
+                 "LoadEntriesToVectorize expected to remain the same");
+        } else {
+          E.CombinedEntriesWithIndices.emplace_back(PrevSize, 0);
+          continue;
+        }
+      }
+
       // Do not try partial vectorization for small nodes (<= 2), nodes with the
       // same opcode and same parent block or all constants.
       if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
@@ -12828,7 +12949,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       });
       InVectors.front() = V;
     }
-    if (!SubVectors.empty()) {
+    if (!SubVectors.empty() &&
+        (SubVectors.size() > 1 || SubVectors.front().second != 0 ||
+         SubVectors.front().first->getVectorFactor() != CommonMask.size())) {
       const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
       if (InVectors.size() == 2)
         Cost += createShuffle(Vec, InVectors.back(), CommonMask);
@@ -13348,7 +13471,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   case Instruction::Trunc:
   case Instruction::FPTrunc:
   case Instruction::BitCast: {
-    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+    auto SrcIt =
+        MinBWs.empty() ? MinBWs.end() : MinBWs.find(getOperandEntry(E, 0));
     Type *SrcScalarTy = VL0->getOperand(0)->getType();
     auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
     unsigned Opcode = ShuffleOrOp;
@@ -13795,7 +13919,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
         auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
         if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
-          auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+          auto SrcIt = MinBWs.empty() ? MinBWs.end()
+                                      : MinBWs.find(getOperandEntry(E, 0));
           unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
           unsigned SrcBWSz =
               DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
@@ -14793,6 +14918,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
       auto *Inst = cast<Instruction>(EU.Scalar);
       InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
       auto OperandIsScalar = [&](Value *V) {
+        if (!isa<Instruction>(V))
+          return true;
         if (!isVectorized(V)) {
           // Some extractelements might be not vectorized, but
           // transformed into shuffle and removed from the function,
@@ -16873,7 +17000,18 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
       });
       InVectors.front() = Vec;
     }
-    if (!SubVectors.empty()) {
+    if (SubVectors.size() == 1 && SubVectors.front().second == 0 &&
+        SubVectors.front().first->getVectorFactor() == CommonMask.size()) {
+      Value *Vec = SubVectors.front().first->VectorizedValue;
+      if (Vec->getType()->isIntOrIntVectorTy())
+        Vec = castToScalarTyElem(
+            Vec, any_of(SubVectors.front().first->Scalars, [&](Value *V) {
+              if (isa<PoisonValue>(V))
+                return false;
+              return !isKnownNonNegative(V, SimplifyQuery(*R.DL));
+            }));
+      transformMaskAfterShuffle(CommonMask, CommonMask);
+    } else if (!SubVectors.empty()) {
       Value *Vec = InVectors.front();
       if (InVectors.size() == 2) {
         Vec = createShuffle(Vec, InVectors.back(), CommonMask);
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
index d4e323819402c..117c892b79f9a 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
@@ -102,80 +102,88 @@ define fastcc i32 @test(i32 %0, i32 %add111.i.i, <4 x i32> %PredPel.i.sroa.86.72
 ; THRESH-NEXT:    [[SHR143_5_I_I9:%.*]] = ashr i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[ADD1392_I:%.*]] = add i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[MUL1445_I:%.*]] = shl i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD2136_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], [[TMP0]]
-; THRESH-NEXT:    [[SHR2137_I:%.*]] = lshr i32 [[ADD2136_I]], 1
-; THRESH-NEXT:    [[CONV2138_I:%.*]] = trunc i32 [[SHR2137_I]] to i16
 ; THRESH-NEXT:    [[ADD2174_I:%.*]] = add i32 [[MUL1445_I]], 2
 ; THRESH-NEXT:    [[SHR2175_I:%.*]] = lshr i32 [[ADD2174_I]], 2
-; THRESH-NEXT:    [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16
 ; THRESH-NEXT:    [[ADD2190_I:%.*]] = or i32 [[ADD1392_I]], 1
 ; THRESH-NEXT:    [[ADD2191_I:%.*]] = add i32 [[ADD2190_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16
-; THRESH-NEXT:    [[ADD2203_I:%.*]] = or i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD2204_I:%.*]] = add i32 [[ADD2203_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2206_I:%.*]] = trunc i32 [[ADD2204_I]] to i16
+; THRESH-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; THRESH-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
+; THRESH-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
+; THRESH-NEXT:    [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
 ; THRESH-NEXT:    [[ADD2235_I16:%.*]] = or i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[ADD2236_I:%.*]] = add i32 [[ADD2235_I16]], 1
 ; THRESH-NEXT:    [[SHR2237_I:%.*]] = lshr i32 [[ADD2236_I]], 1
-; THRESH-NEXT:    [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4
-; THRESH-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8176), align 8
-; THRESH-NEXT:    [[ADD2258_I:%.*]] = or i32 [[ADD111_I_I]], [[TMP0]]
-; THRESH-NEXT:    [[SHR2259_I:%.*]] = lshr i32 [[ADD2258_I]], 1
-; THRESH-NEXT:    [[CONV2260_I:%.*]] = trunc i32 [[SHR2259_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8192), align 8
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8172), align 4
+; THRESH-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[ADD111_I_I]], i32 0
+; THRESH-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 1
+; THRESH-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; THRESH-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THRESH-NEXT:    [[TMP22:%.*]] = or <2 x i32> [[TMP19]], [[TMP21]]
 ; THRESH-NEXT:    [[ADD2302_I:%.*]] = add i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[SHR2303_I:%.*]] = lshr i32 [[ADD2302_I]], 1
-; THRESH-NEXT:    [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8184), align 8
 ; THRESH-NEXT:    [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[ADD2324_I:%.*]] = or i32 [[ADD2323_I]], [[TMP0]]
 ; THRESH-NEXT:    [[SHR2325_I:%.*]] = lshr i32 [[ADD2324_I]], 1
-; THRESH-NEXT:    [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4
-; THRESH-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8200), align 8
-; THRESH-NEXT:    [[ADD2342_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1
-; THRESH-NEXT:    [[SHR2343_I:%.*]] = lshr i32 [[ADD2342_I]], 1
-; THRESH-NEXT:    [[CONV2344_I:%.*]] = trunc i32 [[SHR2343_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2344_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8216), align 8
+; THRESH-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 1
+; THRESH-NEXT:    [[TMP12:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[SHR143_5_I_I9]], i32 0
+; THRESH-NEXT:    [[TMP13:%.*]] = add <2 x i32> [[TMP12]], splat (i32 1)
+; THRESH-NEXT:    [[TMP14:%.*]] = or <2 x i32> [[TMP12]], splat (i32 1)
+; THRESH-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> <i32 0, i32 3>
 ; THRESH-NEXT:    [[ADD2355_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1
 ; THRESH-NEXT:    [[ADD2356_I:%.*]] = add i32 [[ADD2355_I]], [[TMP0]]
 ; THRESH-NEXT:    [[CONV2358_I:%.*]] = trunc i32 [[ADD2356_I]] to i16
 ; THRESH-NEXT:    store i16 [[CONV2358_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8
-; THRESH-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; THRESH-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
-; THRESH-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
-; THRESH-NEXT:    [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
-; THRESH-NEXT:    [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
-; THRESH-NEXT:    store <2 x i16> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4
 ; THRESH-NEXT:    [[ADD2393_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1
 ; THRESH-NEXT:    [[ADD2394_I:%.*]] = add i32 [[ADD2393_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2
-; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8214), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8194), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8174), align 2
+; THRESH-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[SHR2303_I]], i32 2
+; THRESH-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> [[TMP17]], i32 [[SHR2175_I]], i32 3
+; THRESH-NEXT:    [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16
+; THRESH-NEXT:    [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16
+; THRESH-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4
 ; THRESH-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
 ; THRESH-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[ADD111_I_I]], i32 0
 ; THRESH-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP8]], splat (i32 1)
 ; THRESH-NEXT:    [[TMP10:%.*]] = lshr <2 x i32> [[TMP9]], splat (i32 1)
+; THRESH-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP24:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP23]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
 ; THRESH-NEXT:    [[TMP11:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16>
-; THRESH-NEXT:    [[TMP12:%.*]] = extractelement <2 x i16> [[TMP11]], i32 1
-; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
+; THRESH-NEXT:    [[TMP26:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP27:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16>
+; THRESH-NEXT:    store <2 x i16> [[TMP27]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
 ; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8
-; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4
-; THRESH-NEXT:    store i16 [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
 ; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2
 ; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2
-; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8186), align 2
+; THRESH-NEXT:    [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16
+; THRESH-NEXT:    [[TMP28:%.*]] = insertelement <4 x i32> poison, i32 [[ADD2394_I]], i32 0
+; THRESH-NEXT:    [[TMP29:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[SHR2325_I]], i32 1
+; THRESH-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[ADD2191_I]], i32 2
+; THRESH-NEXT:    [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[SHR2303_I]], i32 3
+; THRESH-NEXT:    [[TMP32:%.*]] = trunc <4 x i32> [[TMP31]] to <4 x i16>
+; THRESH-NEXT:    [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16
+; THRESH-NEXT:    [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16
+; THRESH-NEXT:    [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16
+; THRESH-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4
+; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2
+; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8
 ; THRESH-NEXT:    store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8222), align 2
-; THRESH-NEXT:    store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8202), align 2
-; THRESH-NEXT:    store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2
+; THRESH-NEXT:    store <4 x i16> [[TMP32]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2
+; THRESH-NEXT:    [[TMP33:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP34:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> [[TMP34]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; THRESH-NEXT:    [[TMP36:%.*]] = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 poison>, i32 [[TMP0]], i32 3
+; THRESH-NEXT:    [[TMP37:%.*]] = lshr <4 x i32> [[TMP35]], [[TMP36]]
+; THRESH-NEXT:    [[TMP38:%.*]] = add <4 x i32> [[TMP35]], [[TMP36]]
+; THRESH-NEXT:    [[TMP39:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> [[TMP38]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; THRESH-NEXT:    [[TMP40:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP41:%.*]] = shufflevector <8 x i32> [[TMP24]], <8 x i32> [[TMP40]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; THRESH-NEXT:    [[TMP42:%.*]] = trunc <8 x i32> [[TMP41]] to <8 x i16>
+; THRESH-NEXT:    [[TMP43:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP39]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; THRESH-NEXT:    [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[SHR2237_I]], i32 3
+; THRESH-NEXT:    [[TMP45:%.*]] = trunc <4 x i32> [[TMP44]] to <4 x i16>
+; THRESH-NEXT:    [[TMP46:%.*]] = trunc <4 x i32> [[TMP39]] to <4 x i16>
+; THRESH-NEXT:    store <4 x i16> [[TMP45]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
+; THRESH-NEXT:    store <8 x i16> [[TMP42]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4
+; THRESH-NEXT:    store <4 x i16> [[TMP46]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4
 ; THRESH-NEXT:    ret i32 0
 ;
 entry:


