[llvm] [LoadStoreVectorizer] Fill gaps in load/store chains to enable vectorization (PR #159388)
Drew Kersnar via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 21 14:29:35 PST 2025
================
@@ -1676,3 +1893,113 @@ std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
.sextOrTrunc(OrigBitWidth);
return std::nullopt;
}
+
+bool Vectorizer::accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS,
+ Align Alignment,
+ unsigned VecElemBits) const {
+ // Aligned vector accesses are ALWAYS faster than element-wise accesses.
+ if (Alignment.value() % SizeBytes == 0)
+ return true;
+
+ // Element-wise access *might* be faster than misaligned vector accesses.
+ unsigned VectorizedSpeed = 0;
+ bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
+ F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
+ if (!AllowsMisaligned) {
+ LLVM_DEBUG(
+ dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace " << AS
+ << " with alignment " << Alignment.value()
+ << " is misaligned, and therefore can't be vectorized.\n");
+ return false;
+ }
+
+ unsigned ElementwiseSpeed = 0;
+  TTI.allowsMisalignedMemoryAccesses(F.getContext(), VecElemBits, AS,
+                                     Alignment, &ElementwiseSpeed);
+ if (VectorizedSpeed < ElementwiseSpeed) {
+ LLVM_DEBUG(dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace "
+ << AS << " with alignment " << Alignment.value()
+ << " has relative speed " << VectorizedSpeed
+ << ", which is lower than the elementwise speed of "
+ << ElementwiseSpeed
+ << ". Therefore this access won't be vectorized.\n");
+ return false;
+ }
+ return true;
+}
+
+ChainElem Vectorizer::createExtraElementAfter(const ChainElem &Prev, Type *Ty,
+ APInt Offset, StringRef Prefix,
+ Align Alignment) {
+ Instruction *NewElement = nullptr;
+ Builder.SetInsertPoint(Prev.Inst->getNextNode());
+ if (LoadInst *PrevLoad = dyn_cast<LoadInst>(Prev.Inst)) {
+ Value *NewGep = Builder.CreatePtrAdd(
+ PrevLoad->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP");
+ LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n");
+ NewElement = Builder.CreateAlignedLoad(Ty, NewGep, Alignment, Prefix);
+ } else {
+ StoreInst *PrevStore = cast<StoreInst>(Prev.Inst);
+
+ Value *NewGep = Builder.CreatePtrAdd(
+ PrevStore->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP");
+ LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n");
+ NewElement =
+ Builder.CreateAlignedStore(PoisonValue::get(Ty), NewGep, Alignment);
+ }
+
+ // Attach all metadata to the new element.
+ // propagateMetadata will fold it into the final vector when applicable.
+ NewElement->copyMetadata(*Prev.Inst);
+
+  // Cache created elements for tracking and cleanup.
+ ExtraElements.insert(NewElement);
+
+ APInt NewOffsetFromLeader = Prev.OffsetFromLeader + Offset;
+ LLVM_DEBUG(dbgs() << "LSV: Extra Element Created: \n"
+ << *NewElement
+ << " OffsetFromLeader: " << NewOffsetFromLeader << "\n");
+ return ChainElem{NewElement, NewOffsetFromLeader};
+}
+
+Value *Vectorizer::createMaskForExtraElements(const ArrayRef<ChainElem> C,
+ FixedVectorType *VecTy) {
+  // Start each mask element as false.
+  SmallVector<Constant *, 64> MaskElts(VecTy->getNumElements(),
+                                       Builder.getInt1(false));
+ // Iterate over the chain and set the corresponding mask element to true for
+ // each element that is not an extra element.
+ for (const ChainElem &E : C) {
+ if (ExtraElements.contains(E.Inst))
+ continue;
+ unsigned EOffset =
+ (E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue();
+ unsigned VecIdx =
+ 8 * EOffset / DL.getTypeSizeInBits(VecTy->getScalarType());
+ if (FixedVectorType *VT =
+ dyn_cast<FixedVectorType>(getLoadStoreType(E.Inst)))
+ for (unsigned J = 0; J < VT->getNumElements(); ++J)
+ MaskElts[VecIdx + J] = Builder.getInt1(true);
+ else
+ MaskElts[VecIdx] = Builder.getInt1(true);
+ }
+ return ConstantVector::get(MaskElts);
+}
----------------
dakersnar wrote:
Rewrote this to account for redundant elements. Before, each chain element was solely responsible for its corresponding mask element(s). Now, multiple chain elements can overlap on the same mask element, so each mask element starts at `false` and is set to `true` if _any_ of the corresponding chain elements are real elements.
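
To make the overlap handling concrete, here is a minimal standalone sketch of the same start-false/set-true scheme. It uses simplified stand-in types and hypothetical offsets, not the PR's actual `ChainElem`/`ExtraElements` machinery:

```cpp
#include <cstdio>
#include <vector>

// Simplified stand-in for a chain element: a byte offset from the chain
// leader, the number of vector lanes the access covers (1 for a scalar),
// and whether it is a gap-filling "extra" element.
struct Elem {
  unsigned OffsetFromLeader; // bytes
  unsigned NumLanes;         // lanes covered by this access
  bool IsExtra;              // true for gap-filling elements
};

int main() {
  const unsigned LaneSizeBytes = 4; // e.g. i32 lanes
  // Hypothetical chain: a real i32 at offset 0, an extra element at
  // offset 4 made redundant by a real <2 x i32> at the same offset, and
  // an extra element at offset 12 filling a genuine gap.
  std::vector<Elem> Chain = {
      {0, 1, false}, {4, 1, true}, {4, 2, false}, {12, 1, true}};

  // Every lane starts false; only real elements flip lanes to true, so an
  // overlapping extra element can never clear a lane a real element set.
  std::vector<bool> Mask(4, false);
  for (const Elem &E : Chain) {
    if (E.IsExtra)
      continue;
    unsigned VecIdx = E.OffsetFromLeader / LaneSizeBytes;
    for (unsigned J = 0; J < E.NumLanes; ++J)
      Mask[VecIdx + J] = true;
  }

  for (bool B : Mask)
    printf("%d ", B ? 1 : 0); // prints "1 1 1 0"
  printf("\n");
  return 0;
}
```

Lanes 1 and 2 end up `true` even though an extra element also lands on lane 1, which is exactly the overlap case this rewrite handles.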
https://github.com/llvm/llvm-project/pull/159388