[llvm] [LSV] Merge contiguous chains across scalar types (PR #154069)

Tue Dec 2 10:56:35 PST 2025

================
@@ -480,49 +582,120 @@ bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
   });
 
   bool Changed = false;
+  SmallVector<Chain> ContiguousSubChains;
+
   for (const auto &[EqClassKey, EqClass] :
-       collectEquivalenceClasses(Begin, End))
-    Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
+       collectEquivalenceClasses(Begin, End)) {
 
-  return Changed;
-}
+    LLVM_DEBUG({
+      dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
+             << " keyed on " << EqClassKey << ":\n";
+      for (Instruction *I : EqClass)
+        dbgs() << "  " << *I << "\n";
+    });
 
-bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
-                                       ArrayRef<Instruction *> EqClass) {
-  bool Changed = false;
+    for (Chain &C : gatherChains(EqClass)) {
 
-  LLVM_DEBUG({
-    dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
-           << " keyed on " << EqClassKey << ":\n";
-    for (Instruction *I : EqClass)
-      dbgs() << "  " << *I << "\n";
-  });
+      // Split up the chain into increasingly smaller chains, until we can
+      // finally vectorize the chains.
+      //
+      // (Don't be scared by the depth of the loop nest here.  These operations
+      // are all at worst O(n lg n) in the number of instructions, and splitting
+      // chains doesn't change the number of instrs.  So the whole loop nest is
+      // O(n lg n).)
+      for (auto &C : splitChainByMayAliasInstrs(C)) {
+        for (auto &C : splitChainByContiguity(C)) {
+          ContiguousSubChains.emplace_back(C);
+        }
+      }
+    }
+  }
 
-  std::vector<Chain> Chains = gatherChains(EqClass);
-  LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
-                    << " nontrivial chains.\n";);
-  for (Chain &C : Chains)
-    Changed |= runOnChain(C);
-  return Changed;
-}
+  // Merge chains in reverse order, so that the first chain is the largest.
+  for (int I = ContiguousSubChains.size() - 1; I > 0; I--) {
+    Chain &C1 = ContiguousSubChains[I - 1];
+    Chain &C2 = ContiguousSubChains[I];
 
-bool Vectorizer::runOnChain(Chain &C) {
-  LLVM_DEBUG({
-    dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
-    dumpChain(C);
-  });
+    // If the scalar types of the chains are the same, we can merge them
+    // without inserting any casts.
+    if (getLoadStoreType(C1[0].Inst)->getScalarType() ==
+        getLoadStoreType(C2[0].Inst)->getScalarType())
+      continue;
+
+    const Value *C1Ptr = getLoadStorePointerOperand(C1[0].Inst);
+    const Value *C2Ptr = getLoadStorePointerOperand(C2[0].Inst);
+    unsigned AS1 = C1Ptr->getType()->getPointerAddressSpace();
+    unsigned AS2 = C2Ptr->getType()->getPointerAddressSpace();
+    bool C1IsLoad = isa<LoadInst>(C1[0].Inst);
+    bool C2IsLoad = isa<LoadInst>(C2[0].Inst);
+
+    // If the chains are mapped to different types, have distinct underlying
+    // pointer objects, or include both loads and stores, skip.
+    if (C1IsLoad != C2IsLoad || AS1 != AS2 ||
+        ::getUnderlyingObject(C1Ptr) != ::getUnderlyingObject(C2Ptr))
+      continue;
+
+    // Compute constant offset between chain leaders; if unknown, skip.
+    std::optional<APInt> DeltaOpt = computeLeaderDelta(C1[0].Inst, C2[0].Inst);
+    if (!DeltaOpt)
+      continue;
+
+    // Check that rebasing C2 into C1's coordinate space will not overlap C1.
+    if (chainsOverlapAfterRebase(C1, C2, *DeltaOpt))
+      continue;
+
+    // Determine the common integer cast type for normalization and ensure total
+    // bitwidth matches across all elements of both chains.
+    Type *C1ElemTy = getLoadStoreType(C1[0].Inst);
+    unsigned TotalBits = DL.getTypeSizeInBits(C1ElemTy);
+    auto AllElemsMatchTotalBits = [&](const Chain &C) {
+      return llvm::all_of(C, [&](const ChainElem &E) {
+        return DL.getTypeSizeInBits(getLoadStoreType(E.Inst)) == TotalBits;
+      });
+    };
+    if (!AllElemsMatchTotalBits(C1) || !AllElemsMatchTotalBits(C2))
+      continue;
+
+    // Power-of-two span ensures we can form a legal, single vector access
+    // without padding or splitting. Many targets and cost models assume POT
+    // widths, and it guarantees an integral element count for the chosen
+    // VecElemTy.
+    APInt Sz = C2.front().OffsetFromLeader +
+               DL.getTypeStoreSize(getLoadStoreType(C2.front().Inst)) -
+               C1.back().OffsetFromLeader + *DeltaOpt;
+    if (!Sz.isPowerOf2())
+      continue;
+
----------------
dakersnar wrote:

Do you mind explaining this calculation? This doesn't seem right to me.

https://github.com/llvm/llvm-project/pull/154069