[llvm] [LV][AArch64]: Utilise SVE ld4/st4 instructions via auto-vectorisation (PR #89018)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 16 21:20:09 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Hassnaa Hamdi (hassnaaHamdi)
<details>
<summary>Changes</summary>
- Given an array of structs like this: struct xyzt { int x; int y; int z; int t; },
LoopVectorize can't use scalable vectors (SV) to vectorize it,
because scalable vectors have to use intrinsics to deinterleave,
but (de)interleave4 is not available.
- This patch uses (de)interleave2 recursively to get the same results as using (de)interleave4,
so that the vectorizer can use scalable vectors.
- Example: if we have vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %vec),
it will be deinterleaved into { <vscale x 8 x i32>, <vscale x 8 x i32> },
then each extracted vector <vscale x 8 x i32> will be deinterleaved into { <vscale x 4 x i32>, <vscale x 4 x i32> },
so the final result would be: { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> },
which is the same result we would get if we could use deinterleave4.
- Finally, the TLIs that have (de)interleave4 intrinsics can spot that sequence of (de)interleave2 calls and replace it with (de)interleave4.
- This solution is expected to work for any interleaving factor that is a power of 2, as long as the TLI has the equivalent intrinsics.
---
Patch is 32.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/89018.diff
10 Files Affected:
- (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+4)
- (modified) llvm/lib/CodeGen/InterleavedAccessPass.cpp (+66-6)
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+38-11)
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.h (+2)
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+5-3)
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+37-4)
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.h (+3-1)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+48-16)
- (added) llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll (+89)
- (added) llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll (+74)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e0ade02959025f..e233d430e98dd5 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -59,6 +59,8 @@
#include <string>
#include <utility>
#include <vector>
+#include <stack>
+#include <queue>
namespace llvm {
@@ -3145,6 +3147,7 @@ class TargetLoweringBase {
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ std::queue<std::pair<unsigned, Value*>>& LeafNodes,
LoadInst *LI) const {
return false;
}
@@ -3156,6 +3159,7 @@ class TargetLoweringBase {
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ std::queue<Value*>& LeafNodes,
StoreInst *SI) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 438ac1c3cc6e2c..73c3a63b61da3b 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -71,6 +71,7 @@
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
+#include <queue>
using namespace llvm;
@@ -510,12 +511,52 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+ std::stack<IntrinsicInst*> DeinterleaveTreeQueue;
+ std::queue<std::pair<unsigned, Value*>> LeafNodes;
+ std::map<IntrinsicInst*, bool>mp;
+ SmallVector<Instruction *> TempDeadInsts;
+
+ DeinterleaveTreeQueue.push(DI);
+ unsigned DILeafCount = 0;
+ while(!DeinterleaveTreeQueue.empty()) {
+ auto CurrentDI = DeinterleaveTreeQueue.top();
+ DeinterleaveTreeQueue.pop();
+ TempDeadInsts.push_back(CurrentDI);
+ bool RootFound = false;
+ for (auto UserExtract : CurrentDI->users()) { // iterate over extract users of deinterleave
+ Instruction *Extract = dyn_cast<Instruction>(UserExtract);
+ if (!Extract || Extract->getOpcode() != Instruction::ExtractValue)
+ continue;
+ bool IsLeaf = true;
+ for (auto UserDI : UserExtract->users()) { // iterate over deinterleave users of extract
+ IntrinsicInst *Child_DI = dyn_cast<IntrinsicInst>(UserDI);
+ if (!Child_DI ||
+ Child_DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
+ continue;
+ IsLeaf = false;
+ if (mp.count(Child_DI) == 0) {
+ DeinterleaveTreeQueue.push(Child_DI);
+ }
+ continue;
+ }
+ if (IsLeaf) {
+ RootFound = true;
+ LeafNodes.push(std::make_pair(DILeafCount, UserExtract));
+ TempDeadInsts.push_back(Extract);
+ }
+ else {
+ TempDeadInsts.push_back(Extract);
+ }
+ }
+ if (RootFound)
+ DILeafCount += CurrentDI->getNumUses();
+ }
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LeafNodes, LI))
return false;
// We now have a target-specific load, so delete the old one.
- DeadInsts.push_back(DI);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.rbegin(), TempDeadInsts.rend());
DeadInsts.push_back(LI);
return true;
}
@@ -531,14 +572,33 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
-
+ std::queue<IntrinsicInst*> IeinterleaveTreeQueue;
+ std::queue<Value*> LeafNodes;
+ SmallVector<Instruction *> TempDeadInsts;
+
+ IeinterleaveTreeQueue.push(II);
+ while(!IeinterleaveTreeQueue.empty()) {
+ auto node = IeinterleaveTreeQueue.front();
+ TempDeadInsts.push_back(node);
+ IeinterleaveTreeQueue.pop();
+ for(unsigned i = 0; i < 2; i++) {
+ auto op = node->getOperand(i);
+ if(auto CurrentII = dyn_cast<IntrinsicInst>(op)) {
+ if (CurrentII->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
+ continue;
+ IeinterleaveTreeQueue.push(CurrentII);
+ continue;
+ }
+ LeafNodes.push(op);
+ }
+ }
// Try and match this with target specific intrinsics.
- if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
+ if (!TLI->lowerInterleaveIntrinsicToStore(II, LeafNodes, SI))
return false;
// We now have a target-specific store, so delete the old one.
DeadInsts.push_back(SI);
- DeadInsts.push_back(II);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.begin(), TempDeadInsts.end());
return true;
}
@@ -559,7 +619,7 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
// with a factor of 2.
if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2)
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
- if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+ else if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7947d73f9a4dd0..ab8c01e2df5a9a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16345,15 +16345,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, LoadInst *LI) const {
+ IntrinsicInst *DI, std::queue<std::pair<unsigned, llvm::Value*>>& LeafNodes, LoadInst *LI) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ const unsigned Factor = std::max(2, (int)LeafNodes.size());
- VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *VTy = (LeafNodes.size() > 0) ? cast<VectorType>(LeafNodes.front().second->getType()) :
+ cast<VectorType>(DI->getType()->getContainedType(0));
const DataLayout &DL = DI->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16409,8 +16409,27 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Result = Builder.CreateInsertValue(Result, Left, 0);
Result = Builder.CreateInsertValue(Result, Right, 1);
} else {
- if (UseScalable)
+ if (UseScalable) {
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Result);
+ return true;
+ }
+ while (!LeafNodes.empty()) {
+ unsigned ExtractIndex = LeafNodes.front().first;
+ llvm::Value* CurrentExtract = LeafNodes.front().second;
+ LeafNodes.pop();
+ ExtractValueInst* ExtractValueInst = dyn_cast<llvm::ExtractValueInst>(CurrentExtract);
+
+ SmallVector<unsigned, 4> NewIndices;
+ for (auto index : ExtractValueInst->indices())
+ NewIndices.push_back(index + ExtractIndex);
+
+ Value *extrc =Builder.CreateExtractValue(Result, NewIndices);
+ CurrentExtract->replaceAllUsesWith(extrc);
+ }
+ return true;
+ }
else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
@@ -16420,15 +16439,15 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, StoreInst *SI) const {
+ IntrinsicInst *II, std::queue<Value*>& LeafNodes, StoreInst *SI) const {
// Only interleave2 supported at present.
if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ // leaf nodes are the nodes that will be interleaved
+ const unsigned Factor = LeafNodes.size();
- VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
+ VectorType *VTy = cast<VectorType>(LeafNodes.front()->getType());
const DataLayout &DL = II->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16473,8 +16492,16 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
}
- if (UseScalable)
- Builder.CreateCall(StNFunc, {L, R, Pred, Address});
+ if (UseScalable) {
+ SmallVector<Value *> Args;
+ while (!LeafNodes.empty()) {
+ Args.push_back(LeafNodes.front());
+ LeafNodes.pop();
+ }
+ Args.push_back(Pred);
+ Args.push_back(Address);
+ Builder.CreateCall(StNFunc, Args);
+ }
else
Builder.CreateCall(StNFunc, {L, R, Address});
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index db6e8a00d2fb5e..85497a1f7ae41a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -683,9 +683,11 @@ class AArch64TargetLowering : public TargetLowering {
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ std::queue<std::pair<unsigned, Value*>>& LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ std::queue<Value*>& LeafNodes,
StoreInst *SI) const override;
bool isLegalAddImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e80931a03f30b6..35150928f0adb0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3315,15 +3315,17 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
assert(Factor >= 2 && "Invalid interleave factor");
auto *VecVTy = cast<VectorType>(VecTy);
- if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
- return InstructionCost::getInvalid();
+ unsigned MaxFactor = TLI->getMaxSupportedInterleaveFactor();
+ if (VecTy->isScalableTy() &&
+ (!ST->hasSVE() || Factor > MaxFactor))
+ return InstructionCost::getInvalid();
// Vectorization for masked interleaved accesses is only enabled for scalable
// VF.
if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
return InstructionCost::getInvalid();
- if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+ if (!UseMaskForGaps && Factor <= MaxFactor) {
unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
auto *SubVecTy =
VectorType::get(VecVTy->getElementType(),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index dc7c6f83b98579..64e0a2bb1f2942 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21025,6 +21025,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ std::queue<std::pair<unsigned, Value*>>& LeafNodes,
LoadInst *LI) const {
assert(LI->isSimple());
IRBuilder<> Builder(LI);
@@ -21033,10 +21034,11 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = std::max(2, (int)LeafNodes.size());
VectorType *VTy = cast<VectorType>(DI->getOperand(0)->getType());
- VectorType *ResVTy = cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *ResVTy = (LeafNodes.size() > 0) ? cast<VectorType>(LeafNodes.front().second->getType()) :
+ cast<VectorType>(DI->getType()->getContainedType(0));
if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
LI->getPointerAddressSpace(),
@@ -21064,6 +21066,27 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
{ResVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
Ops.append(Factor, PoisonValue::get(ResVTy));
+ Ops.append({LI->getPointerOperand(), VL});
+ Value *Vlseg = Builder.CreateCall(VlsegNFunc, Ops);
+ //-----------
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Vlseg);
+ return true;
+ }
+ unsigned ExtractIndex = 0;
+ while (!LeafNodes.empty()) {
+ ExtractIndex = LeafNodes.front().first;
+ auto CurrentExtract = LeafNodes.front().second;
+ LeafNodes.pop();
+ ExtractValueInst* ExtractValueInst = dyn_cast<llvm::ExtractValueInst>(CurrentExtract);
+ SmallVector<unsigned, 4> NewIndices;
+ for (auto index : ExtractValueInst->indices()) {
+ NewIndices.push_back(index + ExtractIndex);
+ }
+ Value *extrc = Builder.CreateExtractValue(Vlseg, NewIndices);
+ CurrentExtract->replaceAllUsesWith(extrc);
+ }
+ return true;
}
Ops.append({LI->getPointerOperand(), VL});
@@ -21075,6 +21098,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
}
bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ std::queue<Value*>& LeafNodes,
StoreInst *SI) const {
assert(SI->isSimple());
IRBuilder<> Builder(SI);
@@ -21083,10 +21107,10 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = LeafNodes.size();
VectorType *VTy = cast<VectorType>(II->getType());
- VectorType *InVTy = cast<VectorType>(II->getOperand(0)->getType());
+ VectorType *InVTy = cast<VectorType>(LeafNodes.front()->getType());
if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
SI->getPointerAddressSpace(),
@@ -21112,6 +21136,15 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
VssegNFunc = Intrinsic::getDeclaration(SI->getModule(), IntrIds[Factor - 2],
{InVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
+ SmallVector<Value *> Args;
+ while (!LeafNodes.empty()) {
+ Args.push_back(LeafNodes.front());
+ LeafNodes.pop();
+ }
+ Args.push_back(SI->getPointerOperand());
+ Args.push_back(VL);
+ Builder.CreateCall(VssegNFunc, Args);
+ return true;
}
Builder.CreateCall(VssegNFunc, {II->getOperand(0), II->getOperand(1),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index b10da3d40befb7..1f104cf3bc15d5 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -855,10 +855,12 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
+ bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ std::queue<std::pair<unsigned, Value*>>& LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ std::queue<Value*>& LeafNodes,
StoreInst *SI) const override;
bool supportKCFIBundles() const override { return true; }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2057cab46135ff..41f8c5a72ce1e7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -154,6 +154,7 @@
#include <string>
#include <tuple>
#include <utility>
+#include <queue>
using namespace llvm;
@@ -459,10 +460,23 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// Scalable vectors cannot use arbitrary shufflevectors (only splats), so
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
- VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
- return Builder.CreateIntrinsic(
- WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
- /*FMFSource=*/nullptr, Name);
+ SmallVector<Value *> Vecs(Vals);
+ unsigned AllNodesNum = (2*Vals.size()) - 1;
+ // last element in the vec should be the final interleaved result,
+ // so, skip processing last element.
+ AllNodesNum --;
+ // interleave each 2 consecutive nodes, and push result to the vec,
+ // so that we can interleave the interleaved results again if we have
+ // more than 2 vectors to interleave.
+ for (unsigned i = 0; i < AllNodesNum; i +=2) {
+ VectorType *VecTy = cast<VectorType>(Vecs[i]->getType());
+ VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
+ auto InterleavedVec = Builder.CreateIntrinsic(
+ WideVecTy, Intrinsic::experimental_vector_interleave2,
+ {Vecs[i], Vecs[i+1]}, /*FMFSource=*/nullptr, Name);
+ Vecs.push_back(InterleavedVec);
+ }
+ return Vecs[Vecs.size()-1];
}
// Fixed length. Start by concatenating all vectors into a wide vector.
@@ -2519,7 +2533,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
unsigned Part, Value *MaskForGaps) -> Value * {
if (VF.isScalable()) {
assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
- assert(InterleaveFactor == 2 &&
+ assert(isPowerOf2_32(InterleaveFactor) &&
"Unsupported deinterleave factor for scalable vectors");
auto *BlockInMaskPart = State.get(BlockInMask, Part);
SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
@@ -2572,23 +2586,40 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
if (VecTy->isScalableTy()) {
- assert(InterleaveFactor == 2 &&
- "Unsupported deinterleave factor for scalable vectors");
-
+ assert(isPowerOf2_32(InterleaveFactor) &&
+ "Unsupported deinterleave factor for scalable vectors");
for (unsigned Part = 0; Part < UF; ++Part) {
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
// so must use intrinsics to deinterleave.
- Value *DI = Builder.CreateIntrinsic(
- Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
- /*FMFSource=*/nullptr, "strided.vec");
+
+ std::queue<Value *>Queue;
+ Queue.push(NewLoads[Part]);
+ // NonLeaf represents how many times we will do deinterleaving,
+ // think of it as a tree, each node will be deinterleaved, untill we reach to
+ // the leaf nodes which will be the final results of deinterleaving.
+ unsigned NonLeaf = InterleaveFactor - 1;
+ for (unsigned i = 0; i < NonLeaf; i ++) {
+ auto Node = Queue.front();
+ Queue.pop();
+ auto DeinterleaveType = Node->getType();
+ Value *DI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vector_deinterleave2, DeinterleaveType, Node,
+ /*FMFSource=*/nullptr, "root.strided.vec");
+ Value *StridedVec1 = Builder.CreateExtractValue(DI, 0);
+ Value *Strid...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/89018
More information about the llvm-commits
mailing list