[llvm] [InterleavedAccess] Construct interleaved access store with shuffles (PR #164000)

Wed Nov 5 02:57:51 PST 2025

================
@@ -18139,6 +18146,131 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
   return true;
 }
 
+/// If the interleave factor is greater than the supported MaxFactor,
+/// the data can be interleaved with additional shuffles to achieve
+/// the same result.
+///
+/// Consider the following data with 8 interleaves, which are shuffled so that
+/// they can be stored with stN instructions. The data needs to be stored in
+/// this order:
+///     [v0, v1, v2, v3, v4, v5, v6, v7]
+///
+///    v0      v4      v2      v6      v1      v5      v3      v7
+///    |       |       |       |       |       |       |       |
+///     \     /         \     /         \     /         \     /
+///   [zip v0,v4]      [zip v2,v6]    [zip v1,v5]      [zip v3,v7] ==> stN = 4
+///        |               |              |                 |
+///         \             /                \               /
+///          \           /                  \             /
+///           \         /                    \           /
+///       [zip [v0,v2,v4,v6]]            [zip [v1,v3,v5,v7]]     ==> stN = 2
+///
+/// For stN = 4, the upper half of the interleaved data, V0, V1, V2, V3, is
+/// stored with one st4 instruction. The lower half, i.e., V4, V5, V6, V7, is
+/// stored with another st4.
+///
+/// For stN = 2, the first pair of interleaved data, V0, V1, is stored
+/// with one st2 instruction. The second pair, V2, V3, is stored with another
+/// st2. A total of 4 st2 instructions are required here.
+bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle(
+    StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const {
+  unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();
+
+  auto *VecTy = cast<FixedVectorType>(SVI->getType());
+  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
+
+  unsigned LaneLen = VecTy->getNumElements() / Factor;
+  Type *EltTy = VecTy->getElementType();
+  auto *SubVecTy = FixedVectorType::get(EltTy, Factor);
+
+  const DataLayout &DL = SI->getModule()->getDataLayout();
+  bool UseScalable;
+
+  // Skip if we do not have NEON and skip illegal vector types. We can
+  // "legalize" wide vector types into multiple interleaved accesses as long as
+  // the vector types are divisible by 128.
+  if (!Subtarget->hasNEON() ||
+      !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
+    return false;
+
+  if (UseScalable)
+    return false;
+
+  std::deque<Value *> Shuffles;
+  Shuffles.push_back(SVI);
+  unsigned ConcatLevel = Factor;
+  // Getting all the interleaved operands.
+  while (ConcatLevel > 1) {
+    unsigned InterleavedOperands = Shuffles.size();
+    for (unsigned i = 0; i < InterleavedOperands; i++) {
+      ShuffleVectorInst *SFL = dyn_cast<ShuffleVectorInst>(Shuffles.front());
+      if (!SFL)
+        return false;
+      Shuffles.pop_front();
+
+      Value *Op0 = SFL->getOperand(0);
+      Value *Op1 = SFL->getOperand(1);
+
+      Shuffles.push_back(dyn_cast<Value>(Op0));
+      Shuffles.push_back(dyn_cast<Value>(Op1));
+    }
+    if (Shuffles.size() != InterleavedOperands * 2)
+      return false;
+    ConcatLevel = ConcatLevel >> 1;
+  }
+
+  IRBuilder<> Builder(SI);
+  auto Mask = createInterleaveMask(LaneLen, 2);
+  SmallVector<int, 16> UpperHalfMask, LowerHalfMask;
+  for (unsigned i = 0; i < (2 * LaneLen); i++) {
+    if (i < LaneLen)
+      LowerHalfMask.push_back(Mask[i]);
+    else
+      UpperHalfMask.push_back(Mask[i]);
+  }
----------------
Rajveer100 wrote:

```suggestion
  for (unsigned i = 0; i < LaneLen; i++) {
    LowerHalfMask[i] = Mask[i];
    UpperHalfMask[LaneLen + i] = Mask[LaneLen + i];
  }
```

https://github.com/llvm/llvm-project/pull/164000