[clang] [llvm] [VectorCombine] Shrink loads used in shufflevector rebroadcasts (PR #128938)

Wed Mar 12 09:33:43 PDT 2025

================
@@ -3398,6 +3399,101 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
   return true;
 }
 
+// If `I` is a simple load of a fixed-width vector whose only users are
+// shufflevector instructions with a poison or undef second operand, attempt
+// to shrink the load to the leading lanes up to the highest lane index used.
+bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
+  auto *OldLoad = dyn_cast<LoadInst>(&I);
+  if (!OldLoad || !OldLoad->isSimple())
+    return false;
+
+  auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
+  if (!VecTy)
+    return false;
+
+  auto IsPoisonOrUndef = [](Value *V) -> bool {
+    if (auto *C = dyn_cast<Constant>(V)) {
+      return isa<PoisonValue>(C) || isa<UndefValue>(C);
+    }
+    return false;
+  };
+
+  using IndexRange = std::pair<int, int>;
+  auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
+    auto OutputRange = IndexRange(VecTy->getNumElements(), -1);
+    for (auto &Use : I.uses()) {
+      // All uses must be ShuffleVector instructions.
+      auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
+      if (!Shuffle)
+        return {};
+
+      // Get index range for value.
+      auto *Op0 = Shuffle->getOperand(0u);
+      auto *Op1 = Shuffle->getOperand(1u);
+      if (!IsPoisonOrUndef(Op1))
+        return {};
+
+      // Find the min and max indices used by the ShuffleVector instruction.
+      auto Mask = Shuffle->getShuffleMask();
+      auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
+      auto NumElems = int(Op0Ty->getNumElements());
+
+      for (auto Index : Mask) {
+        if (Index >= 0 && Index < NumElems) {
+          OutputRange.first = std::min(Index, OutputRange.first);
+          OutputRange.second = std::max(Index, OutputRange.second);
+        }
+      }
+
+      if (OutputRange.second < OutputRange.first)
+        return {};
+    }
+    return OutputRange;
+  };
+
+  if (auto Indices = GetIndexRangeInShuffles()) {
+    auto OldSize = VecTy->getNumElements();
+    auto NewSize = Indices->second + 1u;
+
+    if (NewSize < OldSize) {
+      auto Builder = IRBuilder(&I);
+      Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+      // Create new load of smaller vector.
+      auto *ElemTy = VecTy->getElementType();
+      auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize);
+      auto *NewLoad = cast<LoadInst>(
+          Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand()));
+      NewLoad->copyMetadata(I);
+
+      // Compare cost of old and new loads.
+      auto OldCost = TTI.getMemoryOpCost(
+          Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
+          OldLoad->getPointerAddressSpace(), CostKind);
+      auto NewCost = TTI.getMemoryOpCost(
+          Instruction::Load, NewLoad->getType(), NewLoad->getAlign(),
+          NewLoad->getPointerAddressSpace(), CostKind);
----------------
nikic wrote:

Does this not account for the shufflevector costs?

There may also be problems because this creates length-changing shuffles, which (as far as I know) often have cost-modelling issues.

https://github.com/llvm/llvm-project/pull/128938