[llvm] [SLP] Check for extracts, being replaced by original scalars, for user nodes (PR #149572)

Mon Jul 28 02:01:59 PDT 2025

================
@@ -9149,6 +9163,81 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
   return {IntrinsicCost, LibCost};
 }
 
+bool BoUpSLP::isProfitableToVectorizeWithNonVecUsers(
+    const InstructionsState &S, const EdgeInfo &UserTreeIdx,
+    ArrayRef<Value *> Scalars, ArrayRef<int> ScalarsMask) {
+  assert(S && "Expected valid instructions state.");
+  // Loads, extracts and geps are immediately scalarizable, so no need to check.
+  if (S.getOpcode() == Instruction::Load ||
+      S.getOpcode() == Instruction::ExtractElement ||
+      S.getOpcode() == Instruction::GetElementPtr)
+    return true;
+  // Check only vectorized users, others scalarized (potentially, at least)
+  // already.
+  if (!UserTreeIdx.UserTE || UserTreeIdx.UserTE->isGather() ||
+      UserTreeIdx.UserTE->State == TreeEntry::SplitVectorize)
+    return true;
+  // PHI nodes may have cyclic deps, so cannot check here.
+  if (UserTreeIdx.UserTE->getOpcode() == Instruction::PHI)
+    return true;
+  // Do not check root reduction nodes, they do not have non-vectorized users.
+  if (UserIgnoreList && UserTreeIdx.UserTE->Idx == 0)
+    return true;
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  ArrayRef<Value *> VL = UserTreeIdx.UserTE->Scalars;
+  Type *UserScalarTy = getValueType(VL.front());
+  if (!isValidElementType(UserScalarTy))
+    return true;
+  Type *ScalarTy = getValueType(Scalars.front());
+  if (!isValidElementType(ScalarTy))
+    return true;
+  // Ignore subvectors extracts.
+  if (UserScalarTy->isVectorTy())
+    return true;
+  auto *UserVecTy =
+      getWidenedType(UserScalarTy, UserTreeIdx.UserTE->getVectorFactor());
+  APInt DemandedElts = APInt::getZero(UserTreeIdx.UserTE->getVectorFactor());
+  // Check the external uses and check, if vector node + extracts is not
+  // profitable for the vectorization.
+  InstructionCost UserScalarsCost = 0;
+  for (Value *V : VL) {
+    auto *I = dyn_cast<Instruction>(V);
+    if (!I)
+      continue;
+    if (areAllUsersVectorized(I, UserIgnoreList))
+      continue;
+    DemandedElts.setBit(UserTreeIdx.UserTE->findLaneForValue(V));
+    UserScalarsCost += TTI->getInstructionCost(I, CostKind);
+  }
+  // No non-vectorized users - success.
+  if (DemandedElts.isZero())
+    return true;
+  // If extracts are cheaper than the original scalars - success.
+  InstructionCost ExtractCost =
+      ::getScalarizationOverhead(*TTI, UserScalarTy, UserVecTy, DemandedElts,
+                                 /*Insert=*/false, /*Extract=*/true, CostKind);
+  if (ExtractCost <= UserScalarsCost)
+    return true;
+  SmallPtrSet<Value *, 4> CheckedExtracts;
+  InstructionCost NodeCost =
+      UserTreeIdx.UserTE->State == TreeEntry::CombinedVectorize
+          ? InstructionCost(0)
+          : getEntryCost(UserTreeIdx.UserTE, {}, CheckedExtracts);
+  // The node is profitable for vectorization - success.
+  if (ExtractCost + NodeCost <= -SLPCostThreshold)
+    return true;
----------------
gbossu wrote:

But the code is checking the cost of extracting vs keeping instructions in `UserVL`, not `VL`. The fate of `UserVL` has already been decided, and as you said, you're not going to change it here.

Shouldn't that code instead check the users of `VL`, especially those that are not in `UserVL` and that we consider "external"?

https://github.com/llvm/llvm-project/pull/149572