[llvm] [InterleavedAccess] Construct interleaved access store with shuffles (PR #164000)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 31 11:47:11 PDT 2025
================
@@ -18173,6 +18180,135 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
return true;
}
+/// If the number of interleaved vector elements is greater than the supported
+/// MaxFactor, the data can be interleaved with additional shuffles to achieve
+/// the same result.
+/// The diagram below shows how 8 interleaved vectors are shuffled so that they
+/// can be stored with stN instructions. The data needs to be stored in this
+/// order: v0,v1,v2,v3,v4,v5,v6,v7
+/// v0 v4 v2 v6 v1 v5 v3 v7
+/// | | | | | | | |
+/// \ / \ / \ / \ /
+/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7]==> stN = 4
+/// | | | |
+/// \ / \ /
+/// \ / \ /
+/// \ / \ /
+/// [zip [v0,v2,v4,v6]] [zip [v1,v3,v5,v7]] ==> stN = 2
+///
+/// At the stN = 4 level, the upper half of the interleaved data (V0,V1,V2,V3)
+/// is stored with one st4 instruction. The lower half (V4,V5,V6,V7) is stored
+/// with another st4.
+///
+/// At the stN = 2 level, the first upper half of the interleaved data (V0,V1)
+/// is stored with one st2 instruction. The second set (V2,V3) is stored with
+/// another st2. A total of 4 st2 instructions are required.
+bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle(
+ StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const {
+ unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();
+
+ auto *VecTy = cast<FixedVectorType>(SVI->getType());
+ assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
+
+ unsigned LaneLen = VecTy->getNumElements() / Factor;
+ Type *EltTy = VecTy->getElementType();
+ auto *SubVecTy = FixedVectorType::get(EltTy, Factor);
+
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+ bool UseScalable;
+
+ // Skip if we do not have NEON and skip illegal vector types. We can
+ // "legalize" wide vector types into multiple interleaved accesses as long as
+ // the vector types are divisible by 128.
+ if (!Subtarget->hasNEON() ||
+ !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
+ return false;
+
+ if (UseScalable)
+ return false;
+
+ std::deque<Value *> Shuffles;
+ Shuffles.push_back(SVI);
+ unsigned ConcatLevel = Factor;
+ while (ConcatLevel > 1) {
+ std::deque<Value *> ShufflesIntermediate;
+ ShufflesIntermediate = Shuffles;
+ Shuffles.clear();
+ while (!ShufflesIntermediate.empty()) {
+ ShuffleVectorInst *SFL =
+ dyn_cast<ShuffleVectorInst>(ShufflesIntermediate.front());
+ if (!SFL)
+ break;
+ ShufflesIntermediate.pop_front();
+
+ Value *Op0 = SFL->getOperand(0);
+ Value *Op1 = SFL->getOperand(1);
+
+ Shuffles.push_back(dyn_cast<Value>(Op0));
+ Shuffles.push_back(dyn_cast<Value>(Op1));
+ }
+ if (!ShufflesIntermediate.empty()) {
+ Shuffles = ShufflesIntermediate;
+ break;
+ }
+ ConcatLevel = ConcatLevel >> 1;
+ }
----------------
ram-NK wrote:
A normal interleaved store with Factor=8 looks like this:
```
%v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
store <32 x i32> %interleaved.vec, ptr %ptr, align 4
```
With my logic:
At ConcatLevel == 8
Shuffles has %s0 and %s1
At ConcatLevel == 4
Shuffles has %v0, %v1, %v2, %v3
At ConcatLevel == 2
Shuffles has %a0, %a1, %a2, %a3, %a4, %a5, %a6 and %a7
After exiting the `while (ConcatLevel > 1)` loop, Shuffles holds 8 elements.
With this logic I am verifying both ConcatLevel and the number of elements in Shuffles.
Now suppose one of the %a values (e.g. %a0) is itself a shufflevector: `%a = shufflevector <2 x i32> %shuf, <2 x i32> undef, <2 x i32> zeroinitializer`.
With your logic, on exiting `while (!Shuffles.empty())`, Shuffles will contain 9 elements and the interleave opportunity will be missed:
%a1, %a2, %a3, %a4, %a5, %a6, %a7, %shuf and undef
https://github.com/llvm/llvm-project/pull/164000
More information about the llvm-commits
mailing list